From 099f53cb50e45ef617a9f1d63ceec799e489418b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 8 Apr 2009 14:28:37 -0700 Subject: async_tx: rename zero_sum to val 'zero_sum' does not properly describe the operation of generating parity and checking that it validates against an existing buffer. Change the name of the operation to 'val' (for 'validate'). This is in anticipation of the p+q case where it is a requirement to identify the target parity buffers separately from the source buffers, because the target parity buffers will not have corresponding pq coefficients. Reviewed-by: Andre Noll Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams --- include/linux/async_tx.h | 2 +- include/linux/dmaengine.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 5fc2ef8d97fa..513150d8c25b 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -117,7 +117,7 @@ async_xor(struct page *dest, struct page **src_list, unsigned int offset, dma_async_tx_callback cb_fn, void *cb_fn_param); struct dma_async_tx_descriptor * -async_xor_zero_sum(struct page *dest, struct page **src_list, +async_xor_val(struct page *dest, struct page **src_list, unsigned int offset, int src_cnt, size_t len, u32 *result, enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx, diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 2e2aa3df170c..6768727d00d7 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -55,8 +55,8 @@ enum dma_transaction_type { DMA_PQ_XOR, DMA_DUAL_XOR, DMA_PQ_UPDATE, - DMA_ZERO_SUM, - DMA_PQ_ZERO_SUM, + DMA_XOR_VAL, + DMA_PQ_VAL, DMA_MEMSET, DMA_MEMCPY_CRC32C, DMA_INTERRUPT, @@ -214,7 +214,7 @@ struct dma_async_tx_descriptor { * @device_free_chan_resources: release DMA channel's resources * @device_prep_dma_memcpy: prepares a memcpy operation * @device_prep_dma_xor: prepares a xor operation - * @device_prep_dma_zero_sum: prepares a zero_sum operation + * @device_prep_dma_xor_val: prepares a xor validation operation * @device_prep_dma_memset: prepares a memset operation * @device_prep_dma_interrupt: prepares an end of chain interrupt operation * @device_prep_slave_sg: prepares a slave dma operation @@ -243,7 +243,7 @@ struct dma_device { struct dma_async_tx_descriptor *(*device_prep_dma_xor)( struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src, unsigned int src_cnt, size_t len, unsigned long flags); - struct dma_async_tx_descriptor *(*device_prep_dma_zero_sum)( + struct dma_async_tx_descriptor *(*device_prep_dma_xor_val)( struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt, size_t len, u32 *result, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_memset)( -- cgit v1.2.3-71-gd317 From 88ba2aa586c874681c072101287e15d40de7e6e2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 9 Apr 2009 16:16:18 -0700 Subject: async_tx: kill ASYNC_TX_DEP_ACK flag In support of inter-channel chaining async_tx utilizes an ack flag to gate whether a dependent operation can be chained to another. While the flag is not set the chain can be considered open for appending. Setting the ack flag closes the chain and flags the descriptor for garbage collection. The ASYNC_TX_DEP_ACK flag essentially means "close the chain after adding this dependency". Since each operation can only have one child the api now implicitly sets the ack flag at dependency submission time. This removes an unnecessary management burden from clients of the api. [ Impact: clean up and enforce one dependency per operation ] Reviewed-by: Andre Noll Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams --- Documentation/crypto/async-tx-api.txt | 9 ++++----- crypto/async_tx/async_memcpy.c | 2 +- crypto/async_tx/async_memset.c | 2 +- crypto/async_tx/async_tx.c | 4 ++-- crypto/async_tx/async_xor.c | 6 ++---- drivers/md/raid5.c | 25 +++++++++++-------------- include/linux/async_tx.h | 4 +--- 7 files changed, 22 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt index 4af12180d191..76feda8541dc 100644 --- a/Documentation/crypto/async-tx-api.txt +++ b/Documentation/crypto/async-tx-api.txt @@ -80,8 +80,8 @@ acknowledged by the application before the offload engine driver is allowed to recycle (or free) the descriptor. A descriptor can be acked by one of the following methods: 1/ setting the ASYNC_TX_ACK flag if no child operations are to be submitted -2/ setting the ASYNC_TX_DEP_ACK flag to acknowledge the parent - descriptor of a new operation. +2/ submitting an unacknowledged descriptor as a dependency to another + async_tx call will implicitly set the acknowledged state. 3/ calling async_tx_ack() on the descriptor. 3.4 When does the operation execute? @@ -136,10 +136,9 @@ int run_xor_copy_xor(struct page **xor_srcs, tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL); - tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len, - ASYNC_TX_DEP_ACK, tx, NULL, NULL); + tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len, tx, NULL, NULL); tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, - ASYNC_TX_XOR_DROP_DST | ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, + ASYNC_TX_XOR_DROP_DST | ASYNC_TX_ACK, tx, complete_xor_copy_xor, NULL); async_tx_issue_pending_all(); diff --git a/crypto/async_tx/async_memcpy.c b/crypto/async_tx/async_memcpy.c index ddccfb01c416..7117ec6f1b74 100644 --- a/crypto/async_tx/async_memcpy.c +++ b/crypto/async_tx/async_memcpy.c @@ -35,7 +35,7 @@ * @src: src page * @offset: offset in pages to start transaction * @len: length in bytes - * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK, + * @flags: ASYNC_TX_ACK * @depend_tx: memcpy depends on the result of this transaction * @cb_fn: function to call when the memcpy completes * @cb_param: parameter to pass to the callback routine diff --git a/crypto/async_tx/async_memset.c b/crypto/async_tx/async_memset.c index 5b5eb99bb244..b2f133885b7f 100644 --- a/crypto/async_tx/async_memset.c +++ b/crypto/async_tx/async_memset.c @@ -35,7 +35,7 @@ * @val: fill value * @offset: offset in pages to start transaction * @len: length in bytes - * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK + * @flags: ASYNC_TX_ACK * @depend_tx: memset depends on the result of this transaction * @cb_fn: function to call when the memcpy completes * @cb_param: parameter to pass to the callback routine diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index 06eb6cc09fef..3766bc3d7d89 100644 --- a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -223,7 +223,7 @@ async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, if (flags & ASYNC_TX_ACK) async_tx_ack(tx); - if (depend_tx && (flags & ASYNC_TX_DEP_ACK)) + if (depend_tx) async_tx_ack(depend_tx); } EXPORT_SYMBOL_GPL(async_tx_submit); @@ -231,7 +231,7 @@ EXPORT_SYMBOL_GPL(async_tx_submit); /** * async_trigger_callback - schedules the callback function to be run after * any dependent operations have been completed. - * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK + * @flags: ASYNC_TX_ACK * @depend_tx: 'callback' requires the completion of this transaction * @cb_fn: function to call after depend_tx completes * @cb_param: parameter to pass to the callback routine diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c index e0580b0ea533..3cc5dc763b54 100644 --- a/crypto/async_tx/async_xor.c +++ b/crypto/async_tx/async_xor.c @@ -105,7 +105,6 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, _cb_param); depend_tx = tx; - flags |= ASYNC_TX_DEP_ACK; if (src_cnt > xor_src_cnt) { /* drop completed sources */ @@ -168,8 +167,7 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset, * @offset: offset in pages to start transaction * @src_cnt: number of source pages * @len: length in bytes - * @flags: ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DEST, - * ASYNC_TX_ACK, ASYNC_TX_DEP_ACK + * @flags: ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DEST, ASYNC_TX_ACK * @depend_tx: xor depends on the result of this transaction. * @cb_fn: function to call when the xor completes * @cb_param: parameter to pass to the callback routine @@ -230,7 +228,7 @@ static int page_is_zero(struct page *p, unsigned int offset, size_t len) * @src_cnt: number of source pages * @len: length in bytes * @result: 0 if sum == 0 else non-zero - * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK + * @flags: ASYNC_TX_ACK * @depend_tx: xor depends on the result of this transaction. * @cb_fn: function to call when the xor completes * @cb_param: parameter to pass to the callback routine diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f8d2d35ed298..0ef5362c8d02 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -525,14 +525,12 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, bio_page = bio_iovec_idx(bio, i)->bv_page; if (frombio) tx = async_memcpy(page, bio_page, page_offset, - b_offset, clen, - ASYNC_TX_DEP_ACK, - tx, NULL, NULL); + b_offset, clen, 0, + tx, NULL, NULL); else tx = async_memcpy(bio_page, page, b_offset, - page_offset, clen, - ASYNC_TX_DEP_ACK, - tx, NULL, NULL); + page_offset, clen, 0, + tx, NULL, NULL); } if (clen < len) /* hit end of page */ break; @@ -615,8 +613,7 @@ static void ops_run_biofill(struct stripe_head *sh) } atomic_inc(&sh->count); - async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, - ops_complete_biofill, sh); + async_trigger_callback(ASYNC_TX_ACK, tx, ops_complete_biofill, sh); } static void ops_complete_compute5(void *stripe_head_ref) @@ -701,8 +698,8 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) } tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, - ops_complete_prexor, sh); + ASYNC_TX_XOR_DROP_DST, tx, + ops_complete_prexor, sh); return tx; } @@ -809,7 +806,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST * for the synchronous xor case */ - flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | + flags = ASYNC_TX_ACK | (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); atomic_inc(&sh->count); @@ -858,7 +855,7 @@ static void ops_run_check(struct stripe_head *sh) &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); atomic_inc(&sh->count); - tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, + tx = async_trigger_callback(ASYNC_TX_ACK, tx, ops_complete_check, sh); } @@ -2687,8 +2684,8 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, /* place all the copies on one channel */ tx = async_memcpy(sh2->dev[dd_idx].page, - sh->dev[i].page, 0, 0, STRIPE_SIZE, - ASYNC_TX_DEP_ACK, tx, NULL, NULL); + sh->dev[i].page, 0, 0, STRIPE_SIZE, + 0, tx, NULL, NULL); set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 513150d8c25b..9f14cd540cd2 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -58,13 +58,11 @@ struct dma_chan_ref { * array. * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a * dependency chain - * @ASYNC_TX_DEP_ACK: ack the dependency descriptor. Useful for chaining. */ enum async_tx_flags { ASYNC_TX_XOR_ZERO_DST = (1 << 0), ASYNC_TX_XOR_DROP_DST = (1 << 1), - ASYNC_TX_ACK = (1 << 3), - ASYNC_TX_DEP_ACK = (1 << 4), + ASYNC_TX_ACK = (1 << 2), }; #ifdef CONFIG_DMA_ENGINE -- cgit v1.2.3-71-gd317 From a08abd8ca890a377521d65d493d174bebcaf694b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 3 Jun 2009 11:43:59 -0700 Subject: async_tx: structify submission arguments, add scribble Prepare the api for the arrival of a new parameter, 'scribble'. This will allow callers to identify scratchpad memory for dma address or page address conversions. As this adds yet another parameter, take this opportunity to convert the common submission parameters (flags, dependency, callback, and callback argument) into an object that is passed by reference. Also, take this opportunity to fix up the kerneldoc and add notes about the relevant ASYNC_TX_* flags for each routine. [ Impact: moves api pass-by-value parameters to a pass-by-reference struct ] Signed-off-by: Andre Noll Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams --- Documentation/crypto/async-tx-api.txt | 6 +- crypto/async_tx/async_memcpy.c | 26 +++---- crypto/async_tx/async_memset.c | 25 +++---- crypto/async_tx/async_tx.c | 51 +++++++------- crypto/async_tx/async_xor.c | 123 +++++++++++++++++----------------- drivers/md/raid5.c | 59 +++++++++------- include/linux/async_tx.h | 84 ++++++++++++++--------- 7 files changed, 200 insertions(+), 174 deletions(-) (limited to 'include/linux') diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt index 76feda8541dc..dfe0475f7919 100644 --- a/Documentation/crypto/async-tx-api.txt +++ b/Documentation/crypto/async-tx-api.txt @@ -54,11 +54,7 @@ features surfaced as a result: 3.1 General format of the API: struct dma_async_tx_descriptor * -async_(, - enum async_tx_flags flags, - struct dma_async_tx_descriptor *dependency, - dma_async_tx_callback callback_routine, - void *callback_parameter); +async_(, struct async_submit ctl *submit) 3.2 Supported operations: memcpy - memory copy between a source and a destination buffer diff --git a/crypto/async_tx/async_memcpy.c b/crypto/async_tx/async_memcpy.c index 7117ec6f1b74..89e05556f3df 100644 --- a/crypto/async_tx/async_memcpy.c +++ b/crypto/async_tx/async_memcpy.c @@ -33,28 +33,28 @@ * async_memcpy - attempt to copy memory with a dma engine. * @dest: destination page * @src: src page - * @offset: offset in pages to start transaction + * @dest_offset: offset into 'dest' to start transaction + * @src_offset: offset into 'src' to start transaction * @len: length in bytes - * @flags: ASYNC_TX_ACK - * @depend_tx: memcpy depends on the result of this transaction - * @cb_fn: function to call when the memcpy completes - * @cb_param: parameter to pass to the callback routine + * @submit: submission / completion modifiers + * + * honored flags: ASYNC_TX_ACK */ struct dma_async_tx_descriptor * async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset, - unsigned int src_offset, size_t len, enum async_tx_flags flags, - struct dma_async_tx_descriptor *depend_tx, - dma_async_tx_callback cb_fn, void *cb_param) + unsigned int src_offset, size_t len, + struct async_submit_ctl *submit) { - struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMCPY, + struct dma_chan *chan = async_tx_find_channel(submit, DMA_MEMCPY, &dest, 1, &src, 1, len); struct dma_device *device = chan ? chan->device : NULL; struct dma_async_tx_descriptor *tx = NULL; if (device) { dma_addr_t dma_dest, dma_src; - unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0; + unsigned long dma_prep_flags; + dma_prep_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0; dma_dest = dma_map_page(device->dev, dest, dest_offset, len, DMA_FROM_DEVICE); @@ -67,13 +67,13 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset, if (tx) { pr_debug("%s: (async) len: %zu\n", __func__, len); - async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); + async_tx_submit(chan, tx, submit); } else { void *dest_buf, *src_buf; pr_debug("%s: (sync) len: %zu\n", __func__, len); /* wait for any prerequisite operations */ - async_tx_quiesce(&depend_tx); + async_tx_quiesce(&submit->depend_tx); dest_buf = kmap_atomic(dest, KM_USER0) + dest_offset; src_buf = kmap_atomic(src, KM_USER1) + src_offset; @@ -83,7 +83,7 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset, kunmap_atomic(dest_buf, KM_USER0); kunmap_atomic(src_buf, KM_USER1); - async_tx_sync_epilog(cb_fn, cb_param); + async_tx_sync_epilog(submit); } return tx; diff --git a/crypto/async_tx/async_memset.c b/crypto/async_tx/async_memset.c index b2f133885b7f..c14437238f4c 100644 --- a/crypto/async_tx/async_memset.c +++ b/crypto/async_tx/async_memset.c @@ -35,26 +35,23 @@ * @val: fill value * @offset: offset in pages to start transaction * @len: length in bytes - * @flags: ASYNC_TX_ACK - * @depend_tx: memset depends on the result of this transaction - * @cb_fn: function to call when the memcpy completes - * @cb_param: parameter to pass to the callback routine + * + * honored flags: ASYNC_TX_ACK */ struct dma_async_tx_descriptor * -async_memset(struct page *dest, int val, unsigned int offset, - size_t len, enum async_tx_flags flags, - struct dma_async_tx_descriptor *depend_tx, - dma_async_tx_callback cb_fn, void *cb_param) +async_memset(struct page *dest, int val, unsigned int offset, size_t len, + struct async_submit_ctl *submit) { - struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMSET, + struct dma_chan *chan = async_tx_find_channel(submit, DMA_MEMSET, &dest, 1, NULL, 0, len); struct dma_device *device = chan ? chan->device : NULL; struct dma_async_tx_descriptor *tx = NULL; if (device) { dma_addr_t dma_dest; - unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0; + unsigned long dma_prep_flags; + dma_prep_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0; dma_dest = dma_map_page(device->dev, dest, offset, len, DMA_FROM_DEVICE); @@ -64,19 +61,19 @@ async_memset(struct page *dest, int val, unsigned int offset, if (tx) { pr_debug("%s: (async) len: %zu\n", __func__, len); - async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); + async_tx_submit(chan, tx, submit); } else { /* run the memset synchronously */ void *dest_buf; pr_debug("%s: (sync) len: %zu\n", __func__, len); - dest_buf = (void *) (((char *) page_address(dest)) + offset); + dest_buf = page_address(dest) + offset; /* wait for any prerequisite operations */ - async_tx_quiesce(&depend_tx); + async_tx_quiesce(&submit->depend_tx); memset(dest_buf, val, len); - async_tx_sync_epilog(cb_fn, cb_param); + async_tx_sync_epilog(submit); } return tx; diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index 3766bc3d7d89..802a5ce437d9 100644 --- a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -45,13 +45,15 @@ static void __exit async_tx_exit(void) /** * __async_tx_find_channel - find a channel to carry out the operation or let * the transaction execute synchronously - * @depend_tx: transaction dependency + * @submit: transaction dependency and submission modifiers * @tx_type: transaction type */ struct dma_chan * -__async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, - enum dma_transaction_type tx_type) +__async_tx_find_channel(struct async_submit_ctl *submit, + enum dma_transaction_type tx_type) { + struct dma_async_tx_descriptor *depend_tx = submit->depend_tx; + /* see if we can keep the chain on one channel */ if (depend_tx && dma_has_cap(tx_type, depend_tx->chan->device->cap_mask)) @@ -144,13 +146,14 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx, /** - * submit_disposition - while holding depend_tx->lock we must avoid submitting - * new operations to prevent a circular locking dependency with - * drivers that already hold a channel lock when calling - * async_tx_run_dependencies. + * submit_disposition - flags for routing an incoming operation * @ASYNC_TX_SUBMITTED: we were able to append the new operation under the lock * @ASYNC_TX_CHANNEL_SWITCH: when the lock is dropped schedule a channel switch * @ASYNC_TX_DIRECT_SUBMIT: when the lock is dropped submit directly + * + * while holding depend_tx->lock we must avoid submitting new operations + * to prevent a circular locking dependency with drivers that already + * hold a channel lock when calling async_tx_run_dependencies. */ enum submit_disposition { ASYNC_TX_SUBMITTED, @@ -160,11 +163,12 @@ enum submit_disposition { void async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, - enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx, - dma_async_tx_callback cb_fn, void *cb_param) + struct async_submit_ctl *submit) { - tx->callback = cb_fn; - tx->callback_param = cb_param; + struct dma_async_tx_descriptor *depend_tx = submit->depend_tx; + + tx->callback = submit->cb_fn; + tx->callback_param = submit->cb_param; if (depend_tx) { enum submit_disposition s; @@ -220,7 +224,7 @@ async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, tx->tx_submit(tx); } - if (flags & ASYNC_TX_ACK) + if (submit->flags & ASYNC_TX_ACK) async_tx_ack(tx); if (depend_tx) @@ -229,21 +233,20 @@ async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, EXPORT_SYMBOL_GPL(async_tx_submit); /** - * async_trigger_callback - schedules the callback function to be run after - * any dependent operations have been completed. - * @flags: ASYNC_TX_ACK - * @depend_tx: 'callback' requires the completion of this transaction - * @cb_fn: function to call after depend_tx completes - * @cb_param: parameter to pass to the callback routine + * async_trigger_callback - schedules the callback function to be run + * @submit: submission and completion parameters + * + * honored flags: ASYNC_TX_ACK + * + * The callback is run after any dependent operations have completed. */ struct dma_async_tx_descriptor * -async_trigger_callback(enum async_tx_flags flags, - struct dma_async_tx_descriptor *depend_tx, - dma_async_tx_callback cb_fn, void *cb_param) +async_trigger_callback(struct async_submit_ctl *submit) { struct dma_chan *chan; struct dma_device *device; struct dma_async_tx_descriptor *tx; + struct dma_async_tx_descriptor *depend_tx = submit->depend_tx; if (depend_tx) { chan = depend_tx->chan; @@ -262,14 +265,14 @@ async_trigger_callback(enum async_tx_flags flags, if (tx) { pr_debug("%s: (async)\n", __func__); - async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); + async_tx_submit(chan, tx, submit); } else { pr_debug("%s: (sync)\n", __func__); /* wait for any prerequisite operations */ - async_tx_quiesce(&depend_tx); + async_tx_quiesce(&submit->depend_tx); - async_tx_sync_epilog(cb_fn, cb_param); + async_tx_sync_epilog(submit); } return tx; diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c index 3cc5dc763b54..691fa98a18c4 100644 --- a/crypto/async_tx/async_xor.c +++ b/crypto/async_tx/async_xor.c @@ -34,18 +34,16 @@ static __async_inline struct dma_async_tx_descriptor * do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, unsigned int offset, int src_cnt, size_t len, - enum async_tx_flags flags, - struct dma_async_tx_descriptor *depend_tx, - dma_async_tx_callback cb_fn, void *cb_param) + struct async_submit_ctl *submit) { struct dma_device *dma = chan->device; dma_addr_t *dma_src = (dma_addr_t *) src_list; struct dma_async_tx_descriptor *tx = NULL; int src_off = 0; int i; - dma_async_tx_callback _cb_fn; - void *_cb_param; - enum async_tx_flags async_flags; + dma_async_tx_callback cb_fn_orig = submit->cb_fn; + void *cb_param_orig = submit->cb_param; + enum async_tx_flags flags_orig = submit->flags; enum dma_ctrl_flags dma_flags; int xor_src_cnt; dma_addr_t dma_dest; @@ -63,7 +61,7 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, } while (src_cnt) { - async_flags = flags; + submit->flags = flags_orig; dma_flags = 0; xor_src_cnt = min(src_cnt, dma->max_xor); /* if we are submitting additional xors, leave the chain open, @@ -71,15 +69,15 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, * buffer mapped */ if (src_cnt > xor_src_cnt) { - async_flags &= ~ASYNC_TX_ACK; + submit->flags &= ~ASYNC_TX_ACK; dma_flags = DMA_COMPL_SKIP_DEST_UNMAP; - _cb_fn = NULL; - _cb_param = NULL; + submit->cb_fn = NULL; + submit->cb_param = NULL; } else { - _cb_fn = cb_fn; - _cb_param = cb_param; + submit->cb_fn = cb_fn_orig; + submit->cb_param = cb_param_orig; } - if (_cb_fn) + if (submit->cb_fn) dma_flags |= DMA_PREP_INTERRUPT; /* Since we have clobbered the src_list we are committed @@ -90,7 +88,7 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, xor_src_cnt, len, dma_flags); if (unlikely(!tx)) - async_tx_quiesce(&depend_tx); + async_tx_quiesce(&submit->depend_tx); /* spin wait for the preceeding transactions to complete */ while (unlikely(!tx)) { @@ -101,10 +99,8 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, dma_flags); } - async_tx_submit(chan, tx, async_flags, depend_tx, _cb_fn, - _cb_param); - - depend_tx = tx; + async_tx_submit(chan, tx, submit); + submit->depend_tx = tx; if (src_cnt > xor_src_cnt) { /* drop completed sources */ @@ -123,8 +119,7 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, static void do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset, - int src_cnt, size_t len, enum async_tx_flags flags, - dma_async_tx_callback cb_fn, void *cb_param) + int src_cnt, size_t len, struct async_submit_ctl *submit) { int i; int xor_src_cnt; @@ -139,7 +134,7 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset, /* set destination address */ dest_buf = page_address(dest) + offset; - if (flags & ASYNC_TX_XOR_ZERO_DST) + if (submit->flags & ASYNC_TX_XOR_ZERO_DST) memset(dest_buf, 0, len); while (src_cnt > 0) { @@ -152,33 +147,35 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset, src_off += xor_src_cnt; } - async_tx_sync_epilog(cb_fn, cb_param); + async_tx_sync_epilog(submit); } /** * async_xor - attempt to xor a set of blocks with a dma engine. - * xor_blocks always uses the dest as a source so the ASYNC_TX_XOR_ZERO_DST - * flag must be set to not include dest data in the calculation. The - * assumption with dma eninges is that they only use the destination - * buffer as a source when it is explicity specified in the source list. * @dest: destination page - * @src_list: array of source pages (if the dest is also a source it must be - * at index zero). The contents of this array may be overwritten. - * @offset: offset in pages to start transaction + * @src_list: array of source pages + * @offset: common src/dst offset to start transaction * @src_cnt: number of source pages * @len: length in bytes - * @flags: ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DEST, ASYNC_TX_ACK - * @depend_tx: xor depends on the result of this transaction. - * @cb_fn: function to call when the xor completes - * @cb_param: parameter to pass to the callback routine + * @submit: submission / completion modifiers + * + * honored flags: ASYNC_TX_ACK, ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DST + * + * xor_blocks always uses the dest as a source so the + * ASYNC_TX_XOR_ZERO_DST flag must be set to not include dest data in + * the calculation. The assumption with dma eninges is that they only + * use the destination buffer as a source when it is explicity specified + * in the source list. + * + * src_list note: if the dest is also a source it must be at index zero. + * The contents of this array will be overwritten if a scribble region + * is not specified. */ struct dma_async_tx_descriptor * async_xor(struct page *dest, struct page **src_list, unsigned int offset, - int src_cnt, size_t len, enum async_tx_flags flags, - struct dma_async_tx_descriptor *depend_tx, - dma_async_tx_callback cb_fn, void *cb_param) + int src_cnt, size_t len, struct async_submit_ctl *submit) { - struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_XOR, + struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR, &dest, 1, src_list, src_cnt, len); BUG_ON(src_cnt <= 1); @@ -188,7 +185,7 @@ async_xor(struct page *dest, struct page **src_list, unsigned int offset, pr_debug("%s (async): len: %zu\n", __func__, len); return do_async_xor(chan, dest, src_list, offset, src_cnt, len, - flags, depend_tx, cb_fn, cb_param); + submit); } else { /* run the xor synchronously */ pr_debug("%s (sync): len: %zu\n", __func__, len); @@ -196,16 +193,15 @@ async_xor(struct page *dest, struct page **src_list, unsigned int offset, /* in the sync case the dest is an implied source * (assumes the dest is the first source) */ - if (flags & ASYNC_TX_XOR_DROP_DST) { + if (submit->flags & ASYNC_TX_XOR_DROP_DST) { src_cnt--; src_list++; } /* wait for any prerequisite operations */ - async_tx_quiesce(&depend_tx); + async_tx_quiesce(&submit->depend_tx); - do_sync_xor(dest, src_list, offset, src_cnt, len, - flags, cb_fn, cb_param); + do_sync_xor(dest, src_list, offset, src_cnt, len, submit); return NULL; } @@ -222,25 +218,25 @@ static int page_is_zero(struct page *p, unsigned int offset, size_t len) /** * async_xor_val - attempt a xor parity check with a dma engine. * @dest: destination page used if the xor is performed synchronously - * @src_list: array of source pages. The dest page must be listed as a source - * at index zero. The contents of this array may be overwritten. + * @src_list: array of source pages * @offset: offset in pages to start transaction * @src_cnt: number of source pages * @len: length in bytes * @result: 0 if sum == 0 else non-zero - * @flags: ASYNC_TX_ACK - * @depend_tx: xor depends on the result of this transaction. - * @cb_fn: function to call when the xor completes - * @cb_param: parameter to pass to the callback routine + * @submit: submission / completion modifiers + * + * honored flags: ASYNC_TX_ACK + * + * src_list note: if the dest is also a source it must be at index zero. + * The contents of this array will be overwritten if a scribble region + * is not specified. */ struct dma_async_tx_descriptor * -async_xor_val(struct page *dest, struct page **src_list, - unsigned int offset, int src_cnt, size_t len, - u32 *result, enum async_tx_flags flags, - struct dma_async_tx_descriptor *depend_tx, - dma_async_tx_callback cb_fn, void *cb_param) +async_xor_val(struct page *dest, struct page **src_list, unsigned int offset, + int src_cnt, size_t len, u32 *result, + struct async_submit_ctl *submit) { - struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_XOR_VAL, + struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR_VAL, &dest, 1, src_list, src_cnt, len); struct dma_device *device = chan ? chan->device : NULL; @@ -250,11 +246,12 @@ async_xor_val(struct page *dest, struct page **src_list, if (device && src_cnt <= device->max_xor) { dma_addr_t *dma_src = (dma_addr_t *) src_list; - unsigned long dma_prep_flags = cb_fn ? DMA_PREP_INTERRUPT : 0; + unsigned long dma_prep_flags; int i; pr_debug("%s: (async) len: %zu\n", __func__, len); + dma_prep_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0; for (i = 0; i < src_cnt; i++) dma_src[i] = dma_map_page(device->dev, src_list[i], offset, len, DMA_TO_DEVICE); @@ -263,7 +260,7 @@ async_xor_val(struct page *dest, struct page **src_list, len, result, dma_prep_flags); if (unlikely(!tx)) { - async_tx_quiesce(&depend_tx); + async_tx_quiesce(&submit->depend_tx); while (!tx) { dma_async_issue_pending(chan); @@ -273,23 +270,23 @@ async_xor_val(struct page *dest, struct page **src_list, } } - async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); + async_tx_submit(chan, tx, submit); } else { - unsigned long xor_flags = flags; + enum async_tx_flags flags_orig = submit->flags; pr_debug("%s: (sync) len: %zu\n", __func__, len); - xor_flags |= ASYNC_TX_XOR_DROP_DST; - xor_flags &= ~ASYNC_TX_ACK; + submit->flags |= ASYNC_TX_XOR_DROP_DST; + submit->flags &= ~ASYNC_TX_ACK; - tx = async_xor(dest, src_list, offset, src_cnt, len, xor_flags, - depend_tx, NULL, NULL); + tx = async_xor(dest, src_list, offset, src_cnt, len, submit); async_tx_quiesce(&tx); *result = page_is_zero(dest, offset, len) ? 0 : 1; - async_tx_sync_epilog(cb_fn, cb_param); + async_tx_sync_epilog(submit); + submit->flags = flags_orig; } return tx; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0ef5362c8d02..e1920f23579f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -499,11 +499,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, struct page *bio_page; int i; int page_offset; + struct async_submit_ctl submit; if (bio->bi_sector >= sector) page_offset = (signed)(bio->bi_sector - sector) * 512; else page_offset = (signed)(sector - bio->bi_sector) * -512; + + init_async_submit(&submit, 0, tx, NULL, NULL, NULL); bio_for_each_segment(bvl, bio, i) { int len = bio_iovec_idx(bio, i)->bv_len; int clen; @@ -525,13 +528,14 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, bio_page = bio_iovec_idx(bio, i)->bv_page; if (frombio) tx = async_memcpy(page, bio_page, page_offset, - b_offset, clen, 0, - tx, NULL, NULL); + b_offset, clen, &submit); else tx = async_memcpy(bio_page, page, b_offset, - page_offset, clen, 0, - tx, NULL, NULL); + page_offset, clen, &submit); } + /* chain the operations */ + submit.depend_tx = tx; + if (clen < len) /* hit end of page */ break; page_offset += len; @@ -590,6 +594,7 @@ static void ops_run_biofill(struct stripe_head *sh) { struct dma_async_tx_descriptor *tx = NULL; raid5_conf_t *conf = sh->raid_conf; + struct async_submit_ctl submit; int i; pr_debug("%s: stripe %llu\n", __func__, @@ -613,7 +618,8 @@ static void ops_run_biofill(struct stripe_head *sh) } atomic_inc(&sh->count); - async_trigger_callback(ASYNC_TX_ACK, tx, ops_complete_biofill, sh); + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); + async_trigger_callback(&submit); } static void ops_complete_compute5(void *stripe_head_ref) @@ -645,6 +651,7 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) struct page *xor_dest = tgt->page; int count = 0; struct dma_async_tx_descriptor *tx; + struct async_submit_ctl submit; int i; pr_debug("%s: stripe %llu block: %d\n", @@ -657,13 +664,12 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) atomic_inc(&sh->count); + init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, + ops_complete_compute5, sh, NULL); if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, - 0, NULL, ops_complete_compute5, sh); + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - ASYNC_TX_XOR_ZERO_DST, NULL, - ops_complete_compute5, sh); + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); return tx; } @@ -683,6 +689,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) int disks = sh->disks; struct page *xor_srcs[disks]; int count = 0, pd_idx = sh->pd_idx, i; + struct async_submit_ctl submit; /* existing parity data subtracted */ struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; @@ -697,9 +704,9 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) xor_srcs[count++] = dev->page; } - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - ASYNC_TX_XOR_DROP_DST, tx, - ops_complete_prexor, sh); + init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, tx, + ops_complete_prexor, sh, NULL); + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); return tx; } @@ -772,7 +779,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) /* kernel stack size limits the total number of disks */ int disks = sh->disks; struct page *xor_srcs[disks]; - + struct async_submit_ctl submit; int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest; int prexor = 0; @@ -811,13 +818,11 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) atomic_inc(&sh->count); - if (unlikely(count == 1)) { - flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, - flags, tx, ops_complete_postxor, sh); - } else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - flags, tx, ops_complete_postxor, sh); + init_async_submit(&submit, flags, tx, ops_complete_postxor, sh, NULL); + if (unlikely(count == 1)) + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); + else + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); } static void ops_complete_check(void *stripe_head_ref) @@ -838,6 +843,7 @@ static void ops_run_check(struct stripe_head *sh) int disks = sh->disks; struct page *xor_srcs[disks]; struct dma_async_tx_descriptor *tx; + struct async_submit_ctl submit; int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; @@ -851,12 +857,13 @@ static void ops_run_check(struct stripe_head *sh) xor_srcs[count++] = dev->page; } + init_async_submit(&submit, 0, NULL, NULL, NULL, NULL); tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); + &sh->ops.zero_sum_result, &submit); atomic_inc(&sh->count); - tx = async_trigger_callback(ASYNC_TX_ACK, tx, - ops_complete_check, sh); + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); + tx = async_trigger_callback(&submit); } static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) @@ -2664,6 +2671,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, if (i != sh->pd_idx && i != sh->qd_idx) { int dd_idx, j; struct stripe_head *sh2; + struct async_submit_ctl submit; sector_t bn = compute_blocknr(sh, i, 1); sector_t s = raid5_compute_sector(conf, bn, 0, @@ -2683,9 +2691,10 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, } /* place all the copies on one channel */ + init_async_submit(&submit, 0, tx, NULL, NULL, NULL); tx = async_memcpy(sh2->dev[dd_idx].page, sh->dev[i].page, 0, 0, STRIPE_SIZE, - 0, tx, NULL, NULL); + &submit); set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 9f14cd540cd2..00cfb637ddf2 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -65,6 +65,22 @@ enum async_tx_flags { ASYNC_TX_ACK = (1 << 2), }; +/** + * struct async_submit_ctl - async_tx submission/completion modifiers + * @flags: submission modifiers + * @depend_tx: parent dependency of the current operation being submitted + * @cb_fn: callback routine to run at operation completion + * @cb_param: parameter for the callback routine + * @scribble: caller provided space for dma/page address conversions + */ +struct async_submit_ctl { + enum async_tx_flags flags; + struct dma_async_tx_descriptor *depend_tx; + dma_async_tx_callback cb_fn; + void *cb_param; + void *scribble; +}; + #ifdef CONFIG_DMA_ENGINE #define async_tx_issue_pending_all dma_issue_pending_all #ifdef CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL @@ -73,8 +89,8 @@ enum async_tx_flags { #define async_tx_find_channel(dep, type, dst, dst_count, src, src_count, len) \ __async_tx_find_channel(dep, type) struct dma_chan * -__async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, - enum dma_transaction_type tx_type); +__async_tx_find_channel(struct async_submit_ctl *submit, + enum dma_transaction_type tx_type); #endif /* CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL */ #else static inline void async_tx_issue_pending_all(void) @@ -83,9 +99,10 @@ static inline void async_tx_issue_pending_all(void) } static inline struct dma_chan * -async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, - enum dma_transaction_type tx_type, struct page **dst, int dst_count, - struct page **src, int src_count, size_t len) +async_tx_find_channel(struct async_submit_ctl *submit, + enum dma_transaction_type tx_type, struct page **dst, + int dst_count, struct page **src, int src_count, + size_t len) { return NULL; } @@ -97,46 +114,53 @@ async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, * @cb_fn_param: parameter to pass to the callback routine */ static inline void -async_tx_sync_epilog(dma_async_tx_callback cb_fn, void *cb_fn_param) +async_tx_sync_epilog(struct async_submit_ctl *submit) +{ + if (submit->cb_fn) + submit->cb_fn(submit->cb_param); +} + +typedef union { + unsigned long addr; + struct page *page; + dma_addr_t dma; +} addr_conv_t; + +static inline void +init_async_submit(struct async_submit_ctl *args, enum async_tx_flags flags, + struct dma_async_tx_descriptor *tx, + dma_async_tx_callback cb_fn, void *cb_param, + addr_conv_t *scribble) { - if (cb_fn) - cb_fn(cb_fn_param); + args->flags = flags; + args->depend_tx = tx; + args->cb_fn = cb_fn; + args->cb_param = cb_param; + args->scribble = scribble; } -void -async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, - enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx, - dma_async_tx_callback cb_fn, void *cb_fn_param); +void async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, + struct async_submit_ctl *submit); struct dma_async_tx_descriptor * async_xor(struct page *dest, struct page **src_list, unsigned int offset, - int src_cnt, size_t len, enum async_tx_flags flags, - struct dma_async_tx_descriptor *depend_tx, - dma_async_tx_callback cb_fn, void *cb_fn_param); + int src_cnt, size_t len, struct async_submit_ctl *submit); struct dma_async_tx_descriptor * -async_xor_val(struct page *dest, struct page **src_list, - unsigned int offset, int src_cnt, size_t len, - u32 *result, enum async_tx_flags flags, - struct dma_async_tx_descriptor *depend_tx, - dma_async_tx_callback cb_fn, void *cb_fn_param); +async_xor_val(struct page *dest, struct page **src_list, unsigned int offset, + int src_cnt, size_t len, u32 *result, + struct async_submit_ctl *submit); struct dma_async_tx_descriptor * async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset, - unsigned int src_offset, size_t len, enum async_tx_flags flags, - struct dma_async_tx_descriptor *depend_tx, - dma_async_tx_callback cb_fn, void *cb_fn_param); + unsigned int src_offset, size_t len, + struct async_submit_ctl *submit); struct dma_async_tx_descriptor * async_memset(struct page *dest, int val, unsigned int offset, - size_t len, enum async_tx_flags flags, - struct dma_async_tx_descriptor *depend_tx, - dma_async_tx_callback cb_fn, void *cb_fn_param); + size_t len, struct async_submit_ctl *submit); -struct dma_async_tx_descriptor * -async_trigger_callback(enum async_tx_flags flags, - struct dma_async_tx_descriptor *depend_tx, - dma_async_tx_callback cb_fn, void *cb_fn_param); +struct dma_async_tx_descriptor *async_trigger_callback(struct async_submit_ctl *submit); void async_tx_quiesce(struct dma_async_tx_descriptor **tx); #endif /* _ASYNC_TX_H_ */ -- cgit v1.2.3-71-gd317 From a635f9dd83f3382577f4544a96df12356e951a40 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 12 Jun 2009 15:20:55 +0200 Subject: HID: use debugfs for report dumping descriptor It is a little bit inconvenient for people who have some non-standard HID hardware (usually violating the HID specification) to have to recompile kernel with CONFIG_HID_DEBUG to be able to see kernel's perspective of the HID report descriptor and observe the parsed events. Plus the messages are then mixed up inconveniently with the rest of the dmesg stuff. This patch implements /sys/kernel/debug/hid//rdesc file, which represents the kernel's view of report descriptor (both the raw report descriptor data and parsed contents). With all the device-specific debug data being available through debugfs, there is no need for keeping CONFIG_HID_DEBUG, as the 'debug' parameter to the hid module will now only output only driver-specific debugging options, which has absolutely minimal memory footprint, just a few error messages and one global flag (hid_debug). We use the current set of output formatting functions. The ones that need to be used both for one-shot rdesc seq_file and also for continuous flow of data (individual reports, as being sent by the device) distinguish according to the passed seq_file parameter, and if it is NULL, it still output to kernel ringbuffer, otherwise the corresponding seq_file is used for output. The format of the output is preserved. Signed-off-by: Jiri Kosina --- drivers/hid/Kconfig | 15 --- drivers/hid/Makefile | 5 +- drivers/hid/hid-core.c | 13 ++- drivers/hid/hid-debug.c | 227 +++++++++++++++++++++++++++++------------- drivers/hid/hid-input.c | 13 +-- drivers/hid/usbhid/hid-core.c | 8 +- include/linux/hid-debug.h | 32 +++--- include/linux/hid.h | 4 + 8 files changed, 195 insertions(+), 122 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 7e67dcb3d4f6..05950a783560 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -31,21 +31,6 @@ config HID If unsure, say Y. -config HID_DEBUG - bool "HID debugging support" - default y - depends on HID - ---help--- - This option lets the HID layer output diagnostics about its internal - state, resolve HID usages, dump HID fields, etc. Individual HID drivers - use this debugging facility to output information about individual HID - devices, etc. - - This feature is useful for those who are either debugging the HID parser - or any HID hardware device. - - If unsure, say Y. - config HIDRAW bool "/dev/hidraw raw HID device support" depends on HID diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile index 1f7cb0fd4505..cf3687d1ed63 100644 --- a/drivers/hid/Makefile +++ b/drivers/hid/Makefile @@ -3,9 +3,12 @@ # hid-objs := hid-core.o hid-input.o +ifdef CONFIG_DEBUG_FS + hid-objs += hid-debug.o +endif + obj-$(CONFIG_HID) += hid.o -hid-$(CONFIG_HID_DEBUG) += hid-debug.o hid-$(CONFIG_HIDRAW) += hidraw.o hid-logitech-objs := hid-lg.o diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 8551693d645f..d4317db85b54 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -44,12 +44,10 @@ #define DRIVER_DESC "HID core driver" #define DRIVER_LICENSE "GPL" -#ifdef CONFIG_HID_DEBUG int hid_debug = 0; module_param_named(debug, hid_debug, int, 0600); MODULE_PARM_DESC(debug, "HID debugging (0=off, 1=probing info, 2=continuous data dumping)"); EXPORT_SYMBOL_GPL(hid_debug); -#endif /* * Register a new report for a device. @@ -987,7 +985,6 @@ int hid_set_field(struct hid_field *field, unsigned offset, __s32 value) if (offset >= field->report_count) { dbg_hid("offset (%d) exceeds report_count (%d)\n", offset, field->report_count); - hid_dump_field(field, 8); return -1; } if (field->logical_minimum < 0) { @@ -1721,6 +1718,8 @@ int hid_add_device(struct hid_device *hdev) if (!ret) hdev->status |= HID_STAT_ADDED; + hid_debug_register(hdev, dev_name(&hdev->dev)); + return ret; } EXPORT_SYMBOL_GPL(hid_add_device); @@ -1768,6 +1767,7 @@ static void hid_remove_device(struct hid_device *hdev) { if (hdev->status & HID_STAT_ADDED) { device_del(&hdev->dev); + hid_debug_unregister(hdev); hdev->status &= ~HID_STAT_ADDED; } } @@ -1843,6 +1843,10 @@ static int __init hid_init(void) { int ret; + if (hid_debug) + printk(KERN_WARNING "HID: hid_debug parameter has been deprecated. " + "Debugging data are now provided via debugfs\n"); + ret = bus_register(&hid_bus_type); if (ret) { printk(KERN_ERR "HID: can't register hid bus\n"); @@ -1853,6 +1857,8 @@ static int __init hid_init(void) if (ret) goto err_bus; + hid_debug_init(); + return 0; err_bus: bus_unregister(&hid_bus_type); @@ -1862,6 +1868,7 @@ err: static void __exit hid_exit(void) { + hid_debug_exit(); hidraw_exit(); bus_unregister(&hid_bus_type); } diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 47ac1a7d66e1..067e173aa3e4 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -1,9 +1,9 @@ /* * (c) 1999 Andreas Gal * (c) 2000-2001 Vojtech Pavlik - * (c) 2007 Jiri Kosina + * (c) 2007-2009 Jiri Kosina * - * Some debug stuff for the HID parser. + * HID debugging support */ /* @@ -26,9 +26,13 @@ * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic */ +#include +#include #include #include +static struct dentry *hid_debug_root; + struct hid_usage_entry { unsigned page; unsigned usage; @@ -331,72 +335,83 @@ static const struct hid_usage_entry hid_usage_table[] = { { 0, 0, NULL } }; -static void resolv_usage_page(unsigned page) { +static void resolv_usage_page(unsigned page, struct seq_file *f) { const struct hid_usage_entry *p; for (p = hid_usage_table; p->description; p++) if (p->page == page) { - printk("%s", p->description); + if (!f) + printk("%s", p->description); + else + seq_printf(f, "%s", p->description); return; } - printk("%04x", page); + if (!f) + printk("%04x", page); + else + seq_printf(f, "%04x", page); } -void hid_resolv_usage(unsigned usage) { +void hid_resolv_usage(unsigned usage, struct seq_file *f) { const struct hid_usage_entry *p; - if (!hid_debug) - return; - - resolv_usage_page(usage >> 16); - printk("."); + resolv_usage_page(usage >> 16, f); + if (!f) + printk("."); + else + seq_printf(f, "."); for (p = hid_usage_table; p->description; p++) if (p->page == (usage >> 16)) { for(++p; p->description && p->usage != 0; p++) if (p->usage == (usage & 0xffff)) { - printk("%s", p->description); + if (!f) + printk("%s", p->description); + else + seq_printf(f, + "%s", + p->description); return; } break; } - printk("%04x", usage & 0xffff); + if (!f) + printk("%04x", usage & 0xffff); + else + seq_printf(f, "%04x", usage & 0xffff); } EXPORT_SYMBOL_GPL(hid_resolv_usage); -static void tab(int n) { - printk(KERN_DEBUG "%*s", n, ""); +static void tab(int n, struct seq_file *f) { + seq_printf(f, "%*s", n, ""); } -void hid_dump_field(struct hid_field *field, int n) { +void hid_dump_field(struct hid_field *field, int n, struct seq_file *f) { int j; - if (!hid_debug) - return; - if (field->physical) { - tab(n); - printk("Physical("); - hid_resolv_usage(field->physical); printk(")\n"); + tab(n, f); + seq_printf(f, "Physical("); + hid_resolv_usage(field->physical, f); seq_printf(f, ")\n"); } if (field->logical) { - tab(n); - printk("Logical("); - hid_resolv_usage(field->logical); printk(")\n"); + tab(n, f); + seq_printf(f, "Logical("); + hid_resolv_usage(field->logical, f); seq_printf(f, ")\n"); } - tab(n); printk("Usage(%d)\n", field->maxusage); + tab(n, f); seq_printf(f, "Usage(%d)\n", field->maxusage); for (j = 0; j < field->maxusage; j++) { - tab(n+2); hid_resolv_usage(field->usage[j].hid); printk("\n"); + tab(n+2, f); hid_resolv_usage(field->usage[j].hid, f); seq_printf(f, "\n"); } if (field->logical_minimum != field->logical_maximum) { - tab(n); printk("Logical Minimum(%d)\n", field->logical_minimum); - tab(n); printk("Logical Maximum(%d)\n", field->logical_maximum); + tab(n, f); seq_printf(f, "Logical Minimum(%d)\n", field->logical_minimum); + tab(n, f); seq_printf(f, "Logical Maximum(%d)\n", field->logical_maximum); } if (field->physical_minimum != field->physical_maximum) { - tab(n); printk("Physical Minimum(%d)\n", field->physical_minimum); - tab(n); printk("Physical Maximum(%d)\n", field->physical_maximum); + tab(n, f); seq_printf(f, "Physical Minimum(%d)\n", field->physical_minimum); + tab(n, f); seq_printf(f, "Physical Maximum(%d)\n", field->physical_maximum); } if (field->unit_exponent) { - tab(n); printk("Unit Exponent(%d)\n", field->unit_exponent); + tab(n, f); seq_printf(f, "Unit Exponent(%d)\n", field->unit_exponent); } if (field->unit) { static const char *systems[5] = { "None", "SI Linear", "SI Rotation", "English Linear", "English Rotation" }; @@ -417,77 +432,75 @@ void hid_dump_field(struct hid_field *field, int n) { data >>= 4; if(sys > 4) { - tab(n); printk("Unit(Invalid)\n"); + tab(n, f); seq_printf(f, "Unit(Invalid)\n"); } else { int earlier_unit = 0; - tab(n); printk("Unit(%s : ", systems[sys]); + tab(n, f); seq_printf(f, "Unit(%s : ", systems[sys]); for (i=1 ; i>= 4; if (nibble != 0) { if(earlier_unit++ > 0) - printk("*"); - printk("%s", units[sys][i]); + seq_printf(f, "*"); + seq_printf(f, "%s", units[sys][i]); if(nibble != 1) { /* This is a _signed_ nibble(!) */ int val = nibble & 0x7; if(nibble & 0x08) val = -((0x7 & ~val) +1); - printk("^%d", val); + seq_printf(f, "^%d", val); } } } - printk(")\n"); + seq_printf(f, ")\n"); } } - tab(n); printk("Report Size(%u)\n", field->report_size); - tab(n); printk("Report Count(%u)\n", field->report_count); - tab(n); printk("Report Offset(%u)\n", field->report_offset); + tab(n, f); seq_printf(f, "Report Size(%u)\n", field->report_size); + tab(n, f); seq_printf(f, "Report Count(%u)\n", field->report_count); + tab(n, f); seq_printf(f, "Report Offset(%u)\n", field->report_offset); - tab(n); printk("Flags( "); + tab(n, f); seq_printf(f, "Flags( "); j = field->flags; - printk("%s", HID_MAIN_ITEM_CONSTANT & j ? "Constant " : ""); - printk("%s", HID_MAIN_ITEM_VARIABLE & j ? "Variable " : "Array "); - printk("%s", HID_MAIN_ITEM_RELATIVE & j ? "Relative " : "Absolute "); - printk("%s", HID_MAIN_ITEM_WRAP & j ? "Wrap " : ""); - printk("%s", HID_MAIN_ITEM_NONLINEAR & j ? "NonLinear " : ""); - printk("%s", HID_MAIN_ITEM_NO_PREFERRED & j ? "NoPreferredState " : ""); - printk("%s", HID_MAIN_ITEM_NULL_STATE & j ? "NullState " : ""); - printk("%s", HID_MAIN_ITEM_VOLATILE & j ? "Volatile " : ""); - printk("%s", HID_MAIN_ITEM_BUFFERED_BYTE & j ? "BufferedByte " : ""); - printk(")\n"); + seq_printf(f, "%s", HID_MAIN_ITEM_CONSTANT & j ? "Constant " : ""); + seq_printf(f, "%s", HID_MAIN_ITEM_VARIABLE & j ? "Variable " : "Array "); + seq_printf(f, "%s", HID_MAIN_ITEM_RELATIVE & j ? "Relative " : "Absolute "); + seq_printf(f, "%s", HID_MAIN_ITEM_WRAP & j ? "Wrap " : ""); + seq_printf(f, "%s", HID_MAIN_ITEM_NONLINEAR & j ? "NonLinear " : ""); + seq_printf(f, "%s", HID_MAIN_ITEM_NO_PREFERRED & j ? "NoPreferredState " : ""); + seq_printf(f, "%s", HID_MAIN_ITEM_NULL_STATE & j ? "NullState " : ""); + seq_printf(f, "%s", HID_MAIN_ITEM_VOLATILE & j ? "Volatile " : ""); + seq_printf(f, "%s", HID_MAIN_ITEM_BUFFERED_BYTE & j ? "BufferedByte " : ""); + seq_printf(f, ")\n"); } EXPORT_SYMBOL_GPL(hid_dump_field); -void hid_dump_device(struct hid_device *device) { +void hid_dump_device(struct hid_device *device, struct seq_file *f) +{ struct hid_report_enum *report_enum; struct hid_report *report; struct list_head *list; unsigned i,k; static const char *table[] = {"INPUT", "OUTPUT", "FEATURE"}; - if (!hid_debug) - return; - for (i = 0; i < HID_REPORT_TYPES; i++) { report_enum = device->report_enum + i; list = report_enum->report_list.next; while (list != &report_enum->report_list) { report = (struct hid_report *) list; - tab(2); - printk("%s", table[i]); + tab(2, f); + seq_printf(f, "%s", table[i]); if (report->id) - printk("(%d)", report->id); - printk("[%s]", table[report->type]); - printk("\n"); + seq_printf(f, "(%d)", report->id); + seq_printf(f, "[%s]", table[report->type]); + seq_printf(f, "\n"); for (k = 0; k < report->maxfield; k++) { - tab(4); - printk("Field(%d)\n", k); - hid_dump_field(report->field[k], 6); + tab(4, f); + seq_printf(f, "Field(%d)\n", k); + hid_dump_field(report->field[k], 6, f); } list = list->next; } @@ -500,7 +513,7 @@ void hid_dump_input(struct hid_usage *usage, __s32 value) { return; printk(KERN_DEBUG "hid-debug: input "); - hid_resolv_usage(usage->hid); + hid_resolv_usage(usage->hid, NULL); printk(" = %d\n", value); } EXPORT_SYMBOL_GPL(hid_dump_input); @@ -767,12 +780,84 @@ static const char **names[EV_MAX + 1] = { [EV_SND] = sounds, [EV_REP] = repeats, }; -void hid_resolv_event(__u8 type, __u16 code) { - - if (!hid_debug) - return; +void hid_resolv_event(__u8 type, __u16 code, struct seq_file *f) { - printk("%s.%s", events[type] ? events[type] : "?", + seq_printf(f, "%s.%s", events[type] ? events[type] : "?", names[type] ? (names[type][code] ? names[type][code] : "?") : "?"); } -EXPORT_SYMBOL_GPL(hid_resolv_event); + +void hid_dump_input_mapping(struct hid_device *hid, struct seq_file *f) +{ + int i, j, k; + struct hid_report *report; + struct hid_usage *usage; + + for (k = HID_INPUT_REPORT; k <= HID_OUTPUT_REPORT; k++) { + list_for_each_entry(report, &hid->report_enum[k].report_list, list) { + for (i = 0; i < report->maxfield; i++) { + for ( j = 0; j < report->field[i]->maxusage; j++) { + usage = report->field[i]->usage + j; + hid_resolv_usage(usage->hid, f); + seq_printf(f, " ---> "); + hid_resolv_event(usage->type, usage->code, f); + seq_printf(f, "\n"); + } + } + } + } + +} + +static int hid_debug_rdesc_show(struct seq_file *f, void *p) +{ + struct hid_device *hdev = f->private; + int i; + + /* dump HID report descriptor */ + for (i = 0; i < hdev->rsize; i++) + seq_printf(f, "%02x ", hdev->rdesc[i]); + seq_printf(f, "\n\n"); + + /* dump parsed data and input mappings */ + hid_dump_device(hdev, f); + seq_printf(f, "\n"); + hid_dump_input_mapping(hdev, f); + + return 0; +} + +static int hid_debug_rdesc_open(struct inode *inode, struct file *file) +{ + return single_open(file, hid_debug_rdesc_show, inode->i_private); +} + +static const struct file_operations hid_debug_rdesc_fops = { + .open = hid_debug_rdesc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void hid_debug_register(struct hid_device *hdev, const char *name) +{ + hdev->debug_dir = debugfs_create_dir(name, hid_debug_root); + hdev->debug_rdesc = debugfs_create_file("rdesc", 0400, + hdev->debug_dir, hdev, &hid_debug_rdesc_fops); +} + +void hid_debug_unregister(struct hid_device *hdev) +{ + debugfs_remove(hdev->debug_rdesc); + debugfs_remove(hdev->debug_dir); +} + +void hid_debug_init(void) +{ + hid_debug_root = debugfs_create_dir("hid", NULL); +} + +void hid_debug_exit(void) +{ + debugfs_remove_recursive(hid_debug_root); +} + diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 7f183b7147e1..5862b0f3b55d 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -159,17 +159,12 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel field->hidinput = hidinput; - dbg_hid("Mapping: "); - hid_resolv_usage(usage->hid); - dbg_hid_line(" ---> "); - if (field->flags & HID_MAIN_ITEM_CONSTANT) goto ignore; /* only LED usages are supported in output fields */ if (field->report_type == HID_OUTPUT_REPORT && (usage->hid & HID_USAGE_PAGE) != HID_UP_LED) { - dbg_hid_line(" [non-LED output field] "); goto ignore; } @@ -561,15 +556,9 @@ mapped: set_bit(MSC_SCAN, input->mscbit); } - hid_resolv_event(usage->type, usage->code); - - dbg_hid_line("\n"); - - return; - ignore: - dbg_hid_line("IGNORED\n"); return; + } void hidinput_hid_event(struct hid_device *hid, struct hid_field *field, struct hid_usage *usage, __s32 value) diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index ac8049b5f1e9..708aa52d0753 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -4,8 +4,8 @@ * Copyright (c) 1999 Andreas Gal * Copyright (c) 2000-2005 Vojtech Pavlik * Copyright (c) 2005 Michael Haboustak for Concept2, Inc - * Copyright (c) 2006-2008 Jiri Kosina * Copyright (c) 2007-2008 Oliver Neukum + * Copyright (c) 2006-2009 Jiri Kosina */ /* @@ -886,11 +886,6 @@ static int usbhid_parse(struct hid_device *hid) goto err; } - dbg_hid("report descriptor (size %u, read %d) = ", rsize, n); - for (n = 0; n < rsize; n++) - dbg_hid_line(" %02x", (unsigned char) rdesc[n]); - dbg_hid_line("\n"); - ret = hid_parse_report(hid, rdesc, rsize); kfree(rdesc); if (ret) { @@ -1005,7 +1000,6 @@ static int usbhid_start(struct hid_device *hid) usbhid->urbctrl->transfer_flags |= (URB_NO_TRANSFER_DMA_MAP | URB_NO_SETUP_DMA_MAP); usbhid_init_reports(hid); - hid_dump_device(hid); set_bit(HID_STARTED, &usbhid->iofl); diff --git a/include/linux/hid-debug.h b/include/linux/hid-debug.h index 50d568ec178a..516e12c33235 100644 --- a/include/linux/hid-debug.h +++ b/include/linux/hid-debug.h @@ -2,7 +2,7 @@ #define __HID_DEBUG_H /* - * Copyright (c) 2007 Jiri Kosina + * Copyright (c) 2007-2009 Jiri Kosina */ /* @@ -22,24 +22,30 @@ * */ -#ifdef CONFIG_HID_DEBUG +#ifdef CONFIG_DEBUG_FS void hid_dump_input(struct hid_usage *, __s32); -void hid_dump_device(struct hid_device *); -void hid_dump_field(struct hid_field *, int); -void hid_resolv_usage(unsigned); -void hid_resolv_event(__u8, __u16); +void hid_dump_device(struct hid_device *, struct seq_file *); +void hid_dump_field(struct hid_field *, int, struct seq_file *); +void hid_resolv_usage(unsigned, struct seq_file *); +void hid_debug_register(struct hid_device *, const char *); +void hid_debug_unregister(struct hid_device *); +void hid_debug_init(void); +void hid_debug_exit(void); #else -#define hid_dump_input(a,b) do { } while (0) -#define hid_dump_device(c) do { } while (0) -#define hid_dump_field(a,b) do { } while (0) -#define hid_resolv_usage(a) do { } while (0) -#define hid_resolv_event(a,b) do { } while (0) - -#endif /* CONFIG_HID_DEBUG */ +#define hid_dump_input(a,b) do { } while (0) +#define hid_dump_device(c) do { } while (0) +#define hid_dump_field(a,b) do { } while (0) +#define hid_resolv_usage(a) do { } while (0) +#define hid_resolv_event(a,b) do { } while (0) +#define hid_debug_register(a, b) do { } while (0) +#define hid_debug_unregister(a) do { } while (0) +#define hid_debug_init() do { } while (0) +#define hid_debug_exit() do { } while (0) +#endif #endif diff --git a/include/linux/hid.h b/include/linux/hid.h index a72876e43589..da09ab140ef1 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -451,6 +451,10 @@ struct hid_device { /* device report descriptor */ char phys[64]; /* Device physical location */ char uniq[64]; /* Device unique identifier (serial #) */ + /* debugfs */ + struct dentry *debug_dir; + struct dentry *debug_rdesc; + void *driver_data; /* temporary hid_ff handling (until moved to the drivers) */ -- cgit v1.2.3-71-gd317 From cd667ce24796700e1a0e6e7528efc61c96ff832e Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 12 Jun 2009 15:20:57 +0200 Subject: HID: use debugfs for events/reports dumping This is a followup patch to the one implemeting rdesc representation in debugfs rather than being dependent on compile-time CONFIG_HID_DEBUG setting. The API of the appropriate formatting functions is slightly modified -- if they are passed seq_file pointer, the one-shot output for 'rdesc' file mode is used, and therefore the message is formatted into the corresponding seq_file immediately. Otherwise the called function allocated a new buffer, formats the text into the buffer and returns the pointer to it, so that it can be queued into the ring-buffer of the processess blocked waiting on input on 'events' file in debugfs. 'debug' parameter to the 'hid' module is now used solely for the prupose of inetrnal driver state debugging (parser, transport, etc). Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 42 ++++++-- drivers/hid/hid-debug.c | 239 ++++++++++++++++++++++++++++++++++++++++++---- include/linux/hid-debug.h | 18 +++- include/linux/hid.h | 26 ++--- 4 files changed, 276 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index d4317db85b54..449bd747d116 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -46,7 +46,7 @@ int hid_debug = 0; module_param_named(debug, hid_debug, int, 0600); -MODULE_PARM_DESC(debug, "HID debugging (0=off, 1=probing info, 2=continuous data dumping)"); +MODULE_PARM_DESC(debug, "toggle HID debugging messages"); EXPORT_SYMBOL_GPL(hid_debug); /* @@ -859,7 +859,7 @@ static void hid_process_event(struct hid_device *hid, struct hid_field *field, struct hid_driver *hdrv = hid->driver; int ret; - hid_dump_input(usage, value); + hid_dump_input(hid, usage, value); if (hdrv && hdrv->event && hid_match_usage(hid, usage)) { ret = hdrv->event(hid, field, usage, value); @@ -981,7 +981,7 @@ int hid_set_field(struct hid_field *field, unsigned offset, __s32 value) { unsigned size = field->report_size; - hid_dump_input(field->usage + offset, value); + hid_dump_input(field->report->device, field->usage + offset, value); if (offset >= field->report_count) { dbg_hid("offset (%d) exceeds report_count (%d)\n", offset, field->report_count); @@ -1075,6 +1075,7 @@ int hid_input_report(struct hid_device *hid, int type, u8 *data, int size, int i struct hid_report_enum *report_enum = hid->report_enum + type; struct hid_driver *hdrv = hid->driver; struct hid_report *report; + char *buf; unsigned int i; int ret; @@ -1086,18 +1087,36 @@ int hid_input_report(struct hid_device *hid, int type, u8 *data, int size, int i return -1; } - dbg_hid("report (size %u) (%snumbered)\n", size, report_enum->numbered ? "" : "un"); + buf = kmalloc(sizeof(char) * HID_DEBUG_BUFSIZE, + interrupt ? GFP_ATOMIC : GFP_KERNEL); + + if (!buf) { + report = hid_get_report(report_enum, data); + goto nomem; + } + + snprintf(buf, HID_DEBUG_BUFSIZE - 1, + "\nreport (size %u) (%snumbered)\n", size, report_enum->numbered ? "" : "un"); + hid_debug_event(hid, buf); report = hid_get_report(report_enum, data); if (!report) return -1; /* dump the report */ - dbg_hid("report %d (size %u) = ", report->id, size); - for (i = 0; i < size; i++) - dbg_hid_line(" %02x", data[i]); - dbg_hid_line("\n"); + snprintf(buf, HID_DEBUG_BUFSIZE - 1, + "report %d (size %u) = ", report->id, size); + hid_debug_event(hid, buf); + for (i = 0; i < size; i++) { + snprintf(buf, HID_DEBUG_BUFSIZE - 1, + " %02x", data[i]); + hid_debug_event(hid, buf); + } + hid_debug_event(hid, "\n"); + + kfree(buf); +nomem: if (hdrv && hdrv->raw_event && hid_match_report(hid, report)) { ret = hdrv->raw_event(hid, report, data, size); if (ret != 0) @@ -1756,6 +1775,9 @@ struct hid_device *hid_allocate_device(void) for (i = 0; i < HID_REPORT_TYPES; i++) INIT_LIST_HEAD(&hdev->report_enum[i].report_list); + init_waitqueue_head(&hdev->debug_wait); + INIT_LIST_HEAD(&hdev->debug_list); + return hdev; err: put_device(&hdev->dev); @@ -1844,8 +1866,8 @@ static int __init hid_init(void) int ret; if (hid_debug) - printk(KERN_WARNING "HID: hid_debug parameter has been deprecated. " - "Debugging data are now provided via debugfs\n"); + printk(KERN_WARNING "HID: hid_debug is now used solely for parser and driver debugging.\n" + "HID: debugfs is now used for inspecting the device (report descriptor, reports)\n"); ret = bus_register(&hid_bus_type); if (ret) { diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 067e173aa3e4..a331a1821e85 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -28,6 +28,10 @@ #include #include +#include +#include +#include + #include #include @@ -335,49 +339,86 @@ static const struct hid_usage_entry hid_usage_table[] = { { 0, 0, NULL } }; -static void resolv_usage_page(unsigned page, struct seq_file *f) { +/* Either output directly into simple seq_file, or (if f == NULL) + * allocate a separate buffer that will then be passed to the 'events' + * ringbuffer. + * + * This is because these functions can be called both for "one-shot" + * "rdesc" while resolving, or for blocking "events". + * + * This holds both for resolv_usage_page() and hid_resolv_usage(). + */ +static char *resolv_usage_page(unsigned page, struct seq_file *f) { const struct hid_usage_entry *p; + char *buf = NULL; + + if (!f) { + buf = kzalloc(sizeof(char) * HID_DEBUG_BUFSIZE, GFP_ATOMIC); + if (!buf) + return ERR_PTR(-ENOMEM); + } for (p = hid_usage_table; p->description; p++) if (p->page == page) { - if (!f) - printk("%s", p->description); - else + if (!f) { + snprintf(buf, HID_DEBUG_BUFSIZE, "%s", + p->description); + return buf; + } + else { seq_printf(f, "%s", p->description); - return; + return NULL; + } } if (!f) - printk("%04x", page); + snprintf(buf, HID_DEBUG_BUFSIZE, "%04x", page); else seq_printf(f, "%04x", page); + return buf; } -void hid_resolv_usage(unsigned usage, struct seq_file *f) { +char *hid_resolv_usage(unsigned usage, struct seq_file *f) { const struct hid_usage_entry *p; + char *buf = NULL; + int len = 0; - resolv_usage_page(usage >> 16, f); - if (!f) - printk("."); - else + buf = resolv_usage_page(usage >> 16, f); + if (IS_ERR(buf)) { + printk(KERN_ERR "error allocating HID debug buffer\n"); + return NULL; + } + + + if (!f) { + len = strlen(buf); + snprintf(buf+len, max(0, HID_DEBUG_BUFSIZE - len), "."); + len++; + } + else { seq_printf(f, "."); + } for (p = hid_usage_table; p->description; p++) if (p->page == (usage >> 16)) { for(++p; p->description && p->usage != 0; p++) if (p->usage == (usage & 0xffff)) { if (!f) - printk("%s", p->description); + snprintf(buf + len, + max(0,HID_DEBUG_BUFSIZE - len - 1), + "%s", p->description); else seq_printf(f, "%s", p->description); - return; + return buf; } break; } if (!f) - printk("%04x", usage & 0xffff); + snprintf(buf + len, max(0, HID_DEBUG_BUFSIZE - len - 1), + "%04x", usage & 0xffff); else seq_printf(f, "%04x", usage & 0xffff); + return buf; } EXPORT_SYMBOL_GPL(hid_resolv_usage); @@ -508,13 +549,37 @@ void hid_dump_device(struct hid_device *device, struct seq_file *f) } EXPORT_SYMBOL_GPL(hid_dump_device); -void hid_dump_input(struct hid_usage *usage, __s32 value) { - if (hid_debug < 2) +/* enqueue string to 'events' ring buffer */ +void hid_debug_event(struct hid_device *hdev, char *buf) +{ + int i; + struct hid_debug_list *list; + + list_for_each_entry(list, &hdev->debug_list, node) { + for (i = 0; i <= strlen(buf); i++) + list->hid_debug_buf[(list->tail + i) % (HID_DEBUG_BUFSIZE - 1)] = + buf[i]; + list->tail = (list->tail + i) % (HID_DEBUG_BUFSIZE - 1); + } +} +EXPORT_SYMBOL_GPL(hid_debug_event); + +void hid_dump_input(struct hid_device *hdev, struct hid_usage *usage, __s32 value) +{ + char *buf; + int len; + + buf = hid_resolv_usage(usage->hid, NULL); + if (!buf) return; + len = strlen(buf); + snprintf(buf + len, HID_DEBUG_BUFSIZE - len - 1, " = %d\n", value); + + hid_debug_event(hdev, buf); + + kfree(buf); + wake_up_interruptible(&hdev->debug_wait); - printk(KERN_DEBUG "hid-debug: input "); - hid_resolv_usage(usage->hid, NULL); - printk(" = %d\n", value); } EXPORT_SYMBOL_GPL(hid_dump_input); @@ -808,6 +873,7 @@ void hid_dump_input_mapping(struct hid_device *hid, struct seq_file *f) } + static int hid_debug_rdesc_show(struct seq_file *f, void *p) { struct hid_device *hdev = f->private; @@ -831,6 +897,126 @@ static int hid_debug_rdesc_open(struct inode *inode, struct file *file) return single_open(file, hid_debug_rdesc_show, inode->i_private); } +static int hid_debug_events_open(struct inode *inode, struct file *file) +{ + int err = 0; + struct hid_debug_list *list; + + if (!(list = kzalloc(sizeof(struct hid_debug_list), GFP_KERNEL))) { + err = -ENOMEM; + goto out; + } + + if (!(list->hid_debug_buf = kzalloc(sizeof(char) * HID_DEBUG_BUFSIZE, GFP_KERNEL))) { + err = -ENOMEM; + goto out; + } + list->hdev = (struct hid_device *) inode->i_private; + file->private_data = list; + mutex_init(&list->read_mutex); + + list_add_tail(&list->node, &list->hdev->debug_list); + +out: + return err; +} + +static ssize_t hid_debug_events_read(struct file *file, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct hid_debug_list *list = file->private_data; + int ret = 0, len; + DECLARE_WAITQUEUE(wait, current); + + while (ret == 0) { + mutex_lock(&list->read_mutex); + if (list->head == list->tail) { + add_wait_queue(&list->hdev->debug_wait, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + while (list->head == list->tail) { + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + if (!list->hdev || !list->hdev->debug) { + ret = -EIO; + break; + } + + /* allow O_NONBLOCK from other threads */ + mutex_unlock(&list->read_mutex); + schedule(); + mutex_lock(&list->read_mutex); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&list->hdev->debug_wait, &wait); + } + + if (ret) + goto out; + + /* pass the ringbuffer contents to userspace */ +copy_rest: + if (list->tail == list->head) + goto out; + if (list->tail > list->head) { + len = list->tail - list->head; + + if (copy_to_user(buffer + ret, &list->hid_debug_buf[list->head], len)) { + ret = -EFAULT; + goto out; + } + ret += len; + list->head += len; + } else { + len = HID_DEBUG_BUFSIZE - list->head; + + if (copy_to_user(buffer, &list->hid_debug_buf[list->head], len)) { + ret = -EFAULT; + goto out; + } + list->head = 0; + ret += len; + goto copy_rest; + } + + } +out: + mutex_unlock(&list->read_mutex); + return ret; +} + +static unsigned int hid_debug_events_poll(struct file *file, poll_table *wait) +{ + struct hid_debug_list *list = file->private_data; + + poll_wait(file, &list->hdev->debug_wait, wait); + if (list->head != list->tail) + return POLLIN | POLLRDNORM; + if (!list->hdev->debug) + return POLLERR | POLLHUP; + return 0; +} + +static int hid_debug_events_release(struct inode *inode, struct file *file) +{ + struct hid_debug_list *list = file->private_data; + + list_del(&list->node); + kfree(list->hid_debug_buf); + kfree(list); + + return 0; +} + static const struct file_operations hid_debug_rdesc_fops = { .open = hid_debug_rdesc_open, .read = seq_read, @@ -838,16 +1024,31 @@ static const struct file_operations hid_debug_rdesc_fops = { .release = single_release, }; +static const struct file_operations hid_debug_events_fops = { + .owner = THIS_MODULE, + .open = hid_debug_events_open, + .read = hid_debug_events_read, + .poll = hid_debug_events_poll, + .release = hid_debug_events_release, +}; + + void hid_debug_register(struct hid_device *hdev, const char *name) { hdev->debug_dir = debugfs_create_dir(name, hid_debug_root); hdev->debug_rdesc = debugfs_create_file("rdesc", 0400, hdev->debug_dir, hdev, &hid_debug_rdesc_fops); + hdev->debug_events = debugfs_create_file("events", 0400, + hdev->debug_dir, hdev, &hid_debug_events_fops); + hdev->debug = 1; } void hid_debug_unregister(struct hid_device *hdev) { + hdev->debug = 0; + wake_up_interruptible(&hdev->debug_wait); debugfs_remove(hdev->debug_rdesc); + debugfs_remove(hdev->debug_events); debugfs_remove(hdev->debug_dir); } diff --git a/include/linux/hid-debug.h b/include/linux/hid-debug.h index 516e12c33235..ec08ac1ad687 100644 --- a/include/linux/hid-debug.h +++ b/include/linux/hid-debug.h @@ -24,14 +24,27 @@ #ifdef CONFIG_DEBUG_FS -void hid_dump_input(struct hid_usage *, __s32); +void hid_dump_input(struct hid_device *, struct hid_usage *, __s32); void hid_dump_device(struct hid_device *, struct seq_file *); void hid_dump_field(struct hid_field *, int, struct seq_file *); -void hid_resolv_usage(unsigned, struct seq_file *); +char *hid_resolv_usage(unsigned, struct seq_file *); void hid_debug_register(struct hid_device *, const char *); void hid_debug_unregister(struct hid_device *); void hid_debug_init(void); void hid_debug_exit(void); +void hid_debug_event(struct hid_device *, char *); + +#define HID_DEBUG_BUFSIZE 512 + +struct hid_debug_list { + char *hid_debug_buf; + int head; + int tail; + struct fasync_struct *fasync; + struct hid_device *hdev; + struct list_head node; + struct mutex read_mutex; +}; #else @@ -44,6 +57,7 @@ void hid_debug_exit(void); #define hid_debug_unregister(a) do { } while (0) #define hid_debug_init() do { } while (0) #define hid_debug_exit() do { } while (0) +#define hid_debug_event(a,b) do { } while (0) #endif diff --git a/include/linux/hid.h b/include/linux/hid.h index da09ab140ef1..60fa52913f89 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -451,10 +451,6 @@ struct hid_device { /* device report descriptor */ char phys[64]; /* Device physical location */ char uniq[64]; /* Device unique identifier (serial #) */ - /* debugfs */ - struct dentry *debug_dir; - struct dentry *debug_rdesc; - void *driver_data; /* temporary hid_ff handling (until moved to the drivers) */ @@ -468,6 +464,14 @@ struct hid_device { /* device report descriptor */ /* handler for raw output data, used by hidraw */ int (*hid_output_raw_report) (struct hid_device *, __u8 *, size_t); + + /* debugging support via debugfs */ + unsigned short debug; + struct dentry *debug_dir; + struct dentry *debug_rdesc; + struct dentry *debug_events; + struct list_head debug_list; + wait_queue_head_t debug_wait; }; static inline void *hid_get_drvdata(struct hid_device *hdev) @@ -625,9 +629,7 @@ struct hid_ll_driver { /* HID core API */ -#ifdef CONFIG_HID_DEBUG extern int hid_debug; -#endif extern int hid_add_device(struct hid_device *); extern void hid_destroy_device(struct hid_device *); @@ -783,21 +785,9 @@ int hid_pidff_init(struct hid_device *hid); #define hid_pidff_init NULL #endif -#ifdef CONFIG_HID_DEBUG #define dbg_hid(format, arg...) if (hid_debug) \ printk(KERN_DEBUG "%s: " format ,\ __FILE__ , ## arg) -#define dbg_hid_line(format, arg...) if (hid_debug) \ - printk(format, ## arg) -#else -static inline int __attribute__((format(printf, 1, 2))) -dbg_hid(const char *fmt, ...) -{ - return 0; -} -#define dbg_hid_line dbg_hid -#endif /* HID_DEBUG */ - #define err_hid(format, arg...) printk(KERN_ERR "%s: " format "\n" , \ __FILE__ , ## arg) #endif /* HID_FF */ -- cgit v1.2.3-71-gd317 From e74e396204bfcb67570ba4517b08f5918e69afea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 30 Mar 2009 19:07:44 +0900 Subject: percpu: use dynamic percpu allocator as the default percpu allocator This patch makes most !CONFIG_HAVE_SETUP_PER_CPU_AREA archs use dynamic percpu allocator. The first chunk is allocated using embedding helper and 8k is reserved for modules. This ensures that the new allocator behaves almost identically to the original allocator as long as static percpu variables are concerned, so it shouldn't introduce much breakage. s390 and alpha use custom SHIFT_PERCPU_PTR() to work around addressing range limit the addressing model imposes. Unfortunately, this breaks if the address is specified using a variable, so for now, the two archs aren't converted. The following architectures are affected by this change. * sh * arm * cris * mips * sparc(32) * blackfin * avr32 * parisc (broken, under investigation) * m32r * powerpc(32) As this change makes the dynamic allocator the default one, CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is replaced with its invert - CONFIG_HAVE_LEGACY_PER_CPU_AREA, which is added to yet-to-be converted archs. These archs implement their own setup_per_cpu_areas() and the conversion is not trivial. * powerpc(64) * sparc(64) * ia64 * alpha * s390 Boot and batch alloc/free tests on x86_32 with debug code (x86_32 doesn't use default first chunk initialization). Compile tested on sparc(32), powerpc(32), arm and alpha. Kyle McMartin reported that this change breaks parisc. The problem is still under investigation and he is okay with pushing this patch forward and fixing parisc later. [ Impact: use dynamic allocator for most archs w/o custom percpu setup ] Signed-off-by: Tejun Heo Acked-by: Rusty Russell Acked-by: David S. Miller Acked-by: Benjamin Herrenschmidt Acked-by: Martin Schwidefsky Reviewed-by: Christoph Lameter Cc: Paul Mundt Cc: Russell King Cc: Mikael Starvik Cc: Ralf Baechle Cc: Bryan Wu Cc: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Hirokazu Takata Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Heiko Carstens Cc: Ingo Molnar --- arch/alpha/Kconfig | 3 +++ arch/ia64/Kconfig | 3 +++ arch/powerpc/Kconfig | 3 +++ arch/s390/Kconfig | 3 +++ arch/sparc/Kconfig | 3 +++ arch/x86/Kconfig | 3 --- include/linux/percpu.h | 12 +++++++++--- init/main.c | 24 ------------------------ kernel/module.c | 6 +++--- mm/Makefile | 2 +- mm/allocpercpu.c | 28 ++++++++++++++++++++++++++++ mm/percpu.c | 40 +++++++++++++++++++++++++++++++++++++++- 12 files changed, 95 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 9fb8aae5c391..05d86407188c 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -70,6 +70,9 @@ config AUTO_IRQ_AFFINITY depends on SMP default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 170042b420d4..328d2f8b8c3f 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -89,6 +89,9 @@ config GENERIC_TIME_VSYSCALL bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + config HAVE_SETUP_PER_CPU_AREA def_bool y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index bf6cedfa05db..a774c2acbe69 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -46,6 +46,9 @@ config GENERIC_HARDIRQS_NO__DO_IRQ bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool PPC64 + config HAVE_SETUP_PER_CPU_AREA def_bool PPC64 diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a14dba0e4d67..f4a3cc62d28f 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -75,6 +75,9 @@ config VIRT_CPU_ACCOUNTING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + mainmenu "Linux Kernel Configuration" config S390 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 3f8b6a92eabd..7a8698b913fe 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,6 +92,9 @@ config AUDIT_ARCH bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y if SPARC64 + config HAVE_SETUP_PER_CPU_AREA def_bool y if SPARC64 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d1430ef6b4f9..a48a90076d83 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -149,9 +149,6 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y -config HAVE_DYNAMIC_PER_CPU_AREA - def_bool y - config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 26fd9d12f050..e5000343dd61 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -34,7 +34,7 @@ #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10) @@ -80,7 +80,7 @@ extern ssize_t __init pcpu_embed_first_chunk( extern void *__alloc_reserved_percpu(size_t size, size_t align); -#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ struct percpu_data { void *ptrs[1]; @@ -99,11 +99,15 @@ struct percpu_data { (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ extern void *__alloc_percpu(size_t size, size_t align); extern void free_percpu(void *__pdata); +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +extern void __init setup_per_cpu_areas(void); +#endif + #else /* CONFIG_SMP */ #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) @@ -124,6 +128,8 @@ static inline void free_percpu(void *p) kfree(p); } +static inline void __init setup_per_cpu_areas(void) { } + #endif /* CONFIG_SMP */ #define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ diff --git a/init/main.c b/init/main.c index 09131ec090c1..602d724afa5c 100644 --- a/init/main.c +++ b/init/main.c @@ -357,7 +357,6 @@ static void __init smp_init(void) #define smp_init() do { } while (0) #endif -static inline void setup_per_cpu_areas(void) { } static inline void setup_nr_cpu_ids(void) { } static inline void smp_prepare_cpus(unsigned int maxcpus) { } @@ -378,29 +377,6 @@ static void __init setup_nr_cpu_ids(void) nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; } -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; - -EXPORT_SYMBOL(__per_cpu_offset); - -static void __init setup_per_cpu_areas(void) -{ - unsigned long size, i; - char *ptr; - unsigned long nr_possible_cpus = num_possible_cpus(); - - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); - ptr = alloc_bootmem_pages(size * nr_possible_cpus); - - for_each_possible_cpu(i) { - __per_cpu_offset[i] = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - ptr += size; - } -} -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ - /* Called by boot processor to activate the rest. */ static void __init smp_init(void) { diff --git a/kernel/module.c b/kernel/module.c index 38928fcaff2b..f5934954fa99 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -364,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) @@ -389,7 +389,7 @@ static void percpu_modfree(void *freeme) free_percpu(freeme); } -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */ /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; @@ -535,7 +535,7 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, diff --git a/mm/Makefile b/mm/Makefile index 5e0bd6426693..c77c6487552f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA obj-$(CONFIG_SMP) += percpu.o else obj-$(CONFIG_SMP) += allocpercpu.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index dfdee6a47359..df34ceae0c67 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -5,6 +5,8 @@ */ #include #include +#include +#include #ifndef cache_line_size #define cache_line_size() L1_CACHE_BYTES @@ -147,3 +149,29 @@ void free_percpu(void *__pdata) kfree(__percpu_disguise(__pdata)); } EXPORT_SYMBOL_GPL(free_percpu); + +/* + * Generic percpu area setup. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; + +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + unsigned long size, i; + char *ptr; + unsigned long nr_possible_cpus = num_possible_cpus(); + + /* Copy section for each CPU (we discard the original) */ + size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + ptr = alloc_bootmem_pages(size * nr_possible_cpus); + + for_each_possible_cpu(i) { + __per_cpu_offset[i] = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + ptr += size; + } +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ diff --git a/mm/percpu.c b/mm/percpu.c index b70f2acd8853..b14984566f5a 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -43,7 +43,7 @@ * * To use this allocator, arch code should do the followings. * - * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be @@ -1275,3 +1275,41 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, reserved_size, dyn_size, pcpue_unit_size, pcpue_ptr, NULL); } + +/* + * Generic percpu area setup. + * + * The embedding helper is used because its behavior closely resembles + * the original non-dynamic generic percpu area setup. This is + * important because many archs have addressing restrictions and might + * fail if the percpu area is located far away from the previous + * location. As an added bonus, in non-NUMA cases, embedding is + * generally a good idea TLB-wise because percpu area can piggy back + * on the physical linear memory mapping which uses large page + * mappings on applicable archs. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + size_t static_size = __per_cpu_end - __per_cpu_start; + ssize_t unit_size; + unsigned long delta; + unsigned int cpu; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. + */ + unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, -1); + if (unit_size < 0) + panic("Failed to initialized percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + cpu * unit_size; +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ -- cgit v1.2.3-71-gd317 From 7c756e6e19e71f0327760d8955f7077118ebb2b1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:50 +0900 Subject: percpu: implement optional weak percpu definitions Some archs (alpha and s390) need to use weak definitions for percpu variables in modules so that the compiler generates external references for them. This patch implements weak percpu definitions which arch can enable by defining ARCH_NEEDS_WEAK_PER_CPU in arch percpu header file. This weak definition adds the following two restrictions on percpu variable definitions. 1. percpu symbols must be unique whether static or not 2. percpu variables can't be defined inside a function To ensure that these restrictions are observed in generic code, config option DEBUG_FORCE_WEAK_PER_CPU enables weak percpu definitions for all cases. This patch is inspired by Ivan Kokshaysky's alpha percpu patch. [ Impact: stricter rules for percpu variables, one more debug config option ] Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: David Howells Cc: Ivan Kokshaysky --- include/linux/percpu-defs.h | 65 ++++++++++++++++++++++++++++++++++++++------- lib/Kconfig.debug | 15 +++++++++++ 2 files changed, 71 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 8f921d74f49f..cf32838ad0fa 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -10,21 +10,68 @@ /* * Base implementations of per-CPU variable declarations and definitions, where * the section in which the variable is to be placed is provided by the - * 'section' argument. This may be used to affect the parameters governing the + * 'sec' argument. This may be used to affect the parameters governing the * variable's storage. * * NOTE! The sections for the DECLARE and for the DEFINE must match, lest * linkage errors occur due the compiler generating the wrong code to access * that section. */ -#define DECLARE_PER_CPU_SECTION(type, name, section) \ - extern \ - __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ - PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name - -#define DEFINE_PER_CPU_SECTION(type, name, section) \ - __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ - PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name +#define __PCPU_ATTRS(sec) \ + __attribute__((section(PER_CPU_BASE_SECTION sec))) \ + PER_CPU_ATTRIBUTES + +#define __PCPU_DUMMY_ATTRS \ + __attribute__((section(".discard"), unused)) + +/* + * s390 and alpha modules require percpu variables to be defined as + * weak to force the compiler to generate GOT based external + * references for them. This is necessary because percpu sections + * will be located outside of the usually addressable area. + * + * This definition puts the following two extra restrictions when + * defining percpu variables. + * + * 1. The symbol must be globally unique, even the static ones. + * 2. Static percpu variables cannot be defined inside a function. + * + * Archs which need weak percpu definitions should define + * ARCH_NEEDS_WEAK_PER_CPU in asm/percpu.h when necessary. + * + * To ensure that the generic code observes the above two + * restrictions, if CONFIG_DEBUG_FORCE_WEAK_PER_CPU is set weak + * definition is used for all cases. + */ +#if defined(ARCH_NEEDS_WEAK_PER_CPU) || defined(CONFIG_DEBUG_FORCE_WEAK_PER_CPU) +/* + * __pcpu_scope_* dummy variable is used to enforce scope. It + * receives the static modifier when it's used in front of + * DEFINE_PER_CPU() and will trigger build failure if + * DECLARE_PER_CPU() is used for the same variable. + * + * __pcpu_unique_* dummy variable is used to enforce symbol uniqueness + * such that hidden weak symbol collision, which will cause unrelated + * variables to share the same address, can be detected during build. + */ +#define DECLARE_PER_CPU_SECTION(type, name, sec) \ + extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ + extern __PCPU_ATTRS(sec) __weak __typeof__(type) per_cpu__##name + +#define DEFINE_PER_CPU_SECTION(type, name, sec) \ + __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ + __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \ + __PCPU_ATTRS(sec) __weak __typeof__(type) per_cpu__##name +#else +/* + * Normal declaration and definition macros. + */ +#define DECLARE_PER_CPU_SECTION(type, name, sec) \ + extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name + +#define DEFINE_PER_CPU_SECTION(type, name, sec) \ + __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name +#endif /* * Variant on the per-CPU variable declaration/definition theme used for diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 23067ab1a73c..77e0d8b1b7c5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -777,6 +777,21 @@ config DEBUG_BLOCK_EXT_DEVT Say N if you are unsure. +config DEBUG_FORCE_WEAK_PER_CPU + bool "Force weak per-cpu definitions" + depends on DEBUG_KERNEL + help + s390 and alpha require percpu variables in modules to be + defined weak to work around addressing range issue which + puts the following two restrictions on percpu variable + definitions. + + 1. percpu symbols must be unique whether static or not + 2. percpu variables can't be defined inside a function + + To ensure that generic code follows the above rules, this + option forces all percpu variables to be defined as weak. + config LKDTM tristate "Linux Kernel Dump Test Tool Module" depends on DEBUG_KERNEL -- cgit v1.2.3-71-gd317 From 38b7f49a0654cb52cac61c6455807248eee3059d Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 26 Jun 2009 10:48:34 +0200 Subject: HID: fix debugfs build with !CONFIG_DEBUG_FS Fix the debug function prototypes to be correct even in the !CONFIG_DEBUG_FS case. Reported-by: Stephen Rothwell Signed-off-by: Jiri Kosina --- include/linux/hid-debug.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid-debug.h b/include/linux/hid-debug.h index ec08ac1ad687..53744fa1c8b7 100644 --- a/include/linux/hid-debug.h +++ b/include/linux/hid-debug.h @@ -22,6 +22,8 @@ * */ +#define HID_DEBUG_BUFSIZE 512 + #ifdef CONFIG_DEBUG_FS void hid_dump_input(struct hid_device *, struct hid_usage *, __s32); @@ -34,7 +36,6 @@ void hid_debug_init(void); void hid_debug_exit(void); void hid_debug_event(struct hid_device *, char *); -#define HID_DEBUG_BUFSIZE 512 struct hid_debug_list { char *hid_debug_buf; @@ -48,11 +49,10 @@ struct hid_debug_list { #else -#define hid_dump_input(a,b) do { } while (0) -#define hid_dump_device(c) do { } while (0) -#define hid_dump_field(a,b) do { } while (0) -#define hid_resolv_usage(a) do { } while (0) -#define hid_resolv_event(a,b) do { } while (0) +#define hid_dump_input(a,b,c) do { } while (0) +#define hid_dump_device(a,b) do { } while (0) +#define hid_dump_field(a,b,c) do { } while (0) +#define hid_resolv_usage(a,b) do { } while (0) #define hid_debug_register(a, b) do { } while (0) #define hid_debug_unregister(a) do { } while (0) #define hid_debug_init() do { } while (0) -- cgit v1.2.3-71-gd317 From 1a8dd307cc0a2119be4e578c517795464e6dabba Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Jun 2009 17:45:39 +0900 Subject: percpu: use __weak only in the definition of weak percpu variables __weak is necessary only for definition and might even not work in declaration. Drop it from declaration. This change was suggested by Ivan Kokshaysky. Signed-off-by: Tejun Heo Acked-by: Ivan Kokshaysky --- include/linux/percpu-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index cf32838ad0fa..9b7a53cc16eb 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -56,7 +56,7 @@ */ #define DECLARE_PER_CPU_SECTION(type, name, sec) \ extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ - extern __PCPU_ATTRS(sec) __weak __typeof__(type) per_cpu__##name + extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name #define DEFINE_PER_CPU_SECTION(type, name, sec) \ __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ -- cgit v1.2.3-71-gd317 From b294a290d24d1196d68399cc3a9b8c50bfb55abd Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 30 Jun 2009 02:13:01 -0400 Subject: Revert "power: remove POWER_SUPPLY_PROP_CAPACITY_LEVEL" This reverts commit 8efe444038a205e79b38b7ad03878824901849a8 and 4cbc76eadf56399cd11fb736b33c53aec9caab8c. Richard@laptop.org was apparently using CAPACITY_LEVEL for debugging battery/EC problems, and was upset that it was removed. This readds it. Conflicts: Documentation/power_supply_class.txt Signed-off-by: Andres Salomon Signed-off-by: Anton Vorontsov --- Documentation/power/power_supply_class.txt | 2 ++ drivers/power/olpc_battery.c | 9 +++++++++ drivers/power/power_supply_sysfs.c | 6 ++++++ include/linux/power_supply.h | 10 ++++++++++ 4 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/Documentation/power/power_supply_class.txt b/Documentation/power/power_supply_class.txt index c6cd4956047c..709d95571d7b 100644 --- a/Documentation/power/power_supply_class.txt +++ b/Documentation/power/power_supply_class.txt @@ -108,6 +108,8 @@ relative, time-based measurements. ENERGY_FULL, ENERGY_EMPTY - same as above but for energy. CAPACITY - capacity in percents. +CAPACITY_LEVEL - capacity level. This corresponds to +POWER_SUPPLY_CAPACITY_LEVEL_*. TEMP - temperature of the power supply. TEMP_AMBIENT - ambient temperature. diff --git a/drivers/power/olpc_battery.c b/drivers/power/olpc_battery.c index 58e419299cd6..3a589df09376 100644 --- a/drivers/power/olpc_battery.c +++ b/drivers/power/olpc_battery.c @@ -276,6 +276,14 @@ static int olpc_bat_get_property(struct power_supply *psy, return ret; val->intval = ec_byte; break; + case POWER_SUPPLY_PROP_CAPACITY_LEVEL: + if (ec_byte & BAT_STAT_FULL) + val->intval = POWER_SUPPLY_CAPACITY_LEVEL_FULL; + else if (ec_byte & BAT_STAT_LOW) + val->intval = POWER_SUPPLY_CAPACITY_LEVEL_LOW; + else + val->intval = POWER_SUPPLY_CAPACITY_LEVEL_NORMAL; + break; case POWER_SUPPLY_PROP_TEMP: ret = olpc_ec_cmd(EC_BAT_TEMP, NULL, 0, (void *)&ec_word, 2); if (ret) @@ -321,6 +329,7 @@ static enum power_supply_property olpc_bat_props[] = { POWER_SUPPLY_PROP_VOLTAGE_AVG, POWER_SUPPLY_PROP_CURRENT_AVG, POWER_SUPPLY_PROP_CAPACITY, + POWER_SUPPLY_PROP_CAPACITY_LEVEL, POWER_SUPPLY_PROP_TEMP, POWER_SUPPLY_PROP_TEMP_AMBIENT, POWER_SUPPLY_PROP_MANUFACTURER, diff --git a/drivers/power/power_supply_sysfs.c b/drivers/power/power_supply_sysfs.c index da73591017f9..9deabbde6fd6 100644 --- a/drivers/power/power_supply_sysfs.c +++ b/drivers/power/power_supply_sysfs.c @@ -51,6 +51,9 @@ static ssize_t power_supply_show_property(struct device *dev, "Unknown", "NiMH", "Li-ion", "Li-poly", "LiFe", "NiCd", "LiMn" }; + static char *capacity_level_text[] = { + "Unknown", "Critical", "Low", "Normal", "High", "Full" + }; ssize_t ret; struct power_supply *psy = dev_get_drvdata(dev); const ptrdiff_t off = attr - power_supply_attrs; @@ -71,6 +74,8 @@ static ssize_t power_supply_show_property(struct device *dev, return sprintf(buf, "%s\n", health_text[value.intval]); else if (off == POWER_SUPPLY_PROP_TECHNOLOGY) return sprintf(buf, "%s\n", technology_text[value.intval]); + else if (off == POWER_SUPPLY_PROP_CAPACITY_LEVEL) + return sprintf(buf, "%s\n", capacity_level_text[value.intval]); else if (off >= POWER_SUPPLY_PROP_MODEL_NAME) return sprintf(buf, "%s\n", value.strval); @@ -109,6 +114,7 @@ static struct device_attribute power_supply_attrs[] = { POWER_SUPPLY_ATTR(energy_now), POWER_SUPPLY_ATTR(energy_avg), POWER_SUPPLY_ATTR(capacity), + POWER_SUPPLY_ATTR(capacity_level), POWER_SUPPLY_ATTR(temp), POWER_SUPPLY_ATTR(temp_ambient), POWER_SUPPLY_ATTR(time_to_empty_now), diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 594c494ac3f0..0ab6aa171241 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -58,6 +58,15 @@ enum { POWER_SUPPLY_TECHNOLOGY_LiMn, }; +enum { + POWER_SUPPLY_CAPACITY_LEVEL_UNKNOWN = 0, + POWER_SUPPLY_CAPACITY_LEVEL_CRITICAL, + POWER_SUPPLY_CAPACITY_LEVEL_LOW, + POWER_SUPPLY_CAPACITY_LEVEL_NORMAL, + POWER_SUPPLY_CAPACITY_LEVEL_HIGH, + POWER_SUPPLY_CAPACITY_LEVEL_FULL, +}; + enum power_supply_property { /* Properties of type `int' */ POWER_SUPPLY_PROP_STATUS = 0, @@ -89,6 +98,7 @@ enum power_supply_property { POWER_SUPPLY_PROP_ENERGY_NOW, POWER_SUPPLY_PROP_ENERGY_AVG, POWER_SUPPLY_PROP_CAPACITY, /* in percents! */ + POWER_SUPPLY_PROP_CAPACITY_LEVEL, POWER_SUPPLY_PROP_TEMP, POWER_SUPPLY_PROP_TEMP_AMBIENT, POWER_SUPPLY_PROP_TIME_TO_EMPTY_NOW, -- cgit v1.2.3-71-gd317 From ee8076ed3e1cdd0cd1e61318386932669c90b92f Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Thu, 2 Jul 2009 09:45:18 -0400 Subject: power_supply: Add a charge_type property, and use it for olpc driver This adds a new sysfs file called 'charge_type' which displays the type of charging (unknown, n/a, trickle charge, or fast charging). This allows things like battery diagnostics to determine what the battery/EC is doing without resorting to changing the 'status' sysfs output. Signed-off-by: Andres Salomon Acked-by: Mark Brown Signed-off-by: Anton Vorontsov --- Documentation/power/power_supply_class.txt | 5 +++++ drivers/power/olpc_battery.c | 9 +++++++++ drivers/power/power_supply_sysfs.c | 6 ++++++ include/linux/power_supply.h | 8 ++++++++ 4 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/Documentation/power/power_supply_class.txt b/Documentation/power/power_supply_class.txt index 709d95571d7b..9f16c5178b66 100644 --- a/Documentation/power/power_supply_class.txt +++ b/Documentation/power/power_supply_class.txt @@ -76,6 +76,11 @@ STATUS - this attribute represents operating status (charging, full, discharging (i.e. powering a load), etc.). This corresponds to BATTERY_STATUS_* values, as defined in battery.h. +CHARGE_TYPE - batteries can typically charge at different rates. +This defines trickle and fast charges. For batteries that +are already charged or discharging, 'n/a' can be displayed (or +'unknown', if the status is not known). + HEALTH - represents health of the battery, values corresponds to POWER_SUPPLY_HEALTH_*, defined in battery.h. diff --git a/drivers/power/olpc_battery.c b/drivers/power/olpc_battery.c index 602bbd008f78..8fefe5a73558 100644 --- a/drivers/power/olpc_battery.c +++ b/drivers/power/olpc_battery.c @@ -233,6 +233,14 @@ static int olpc_bat_get_property(struct power_supply *psy, if (ret) return ret; break; + case POWER_SUPPLY_PROP_CHARGE_TYPE: + if (ec_byte & BAT_STAT_TRICKLE) + val->intval = POWER_SUPPLY_CHARGE_TYPE_TRICKLE; + else if (ec_byte & BAT_STAT_CHARGING) + val->intval = POWER_SUPPLY_CHARGE_TYPE_FAST; + else + val->intval = POWER_SUPPLY_CHARGE_TYPE_NONE; + break; case POWER_SUPPLY_PROP_PRESENT: val->intval = !!(ec_byte & (BAT_STAT_PRESENT | BAT_STAT_TRICKLE)); @@ -325,6 +333,7 @@ static int olpc_bat_get_property(struct power_supply *psy, static enum power_supply_property olpc_bat_props[] = { POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_CHARGE_TYPE, POWER_SUPPLY_PROP_PRESENT, POWER_SUPPLY_PROP_HEALTH, POWER_SUPPLY_PROP_TECHNOLOGY, diff --git a/drivers/power/power_supply_sysfs.c b/drivers/power/power_supply_sysfs.c index 9deabbde6fd6..08144393d64b 100644 --- a/drivers/power/power_supply_sysfs.c +++ b/drivers/power/power_supply_sysfs.c @@ -43,6 +43,9 @@ static ssize_t power_supply_show_property(struct device *dev, static char *status_text[] = { "Unknown", "Charging", "Discharging", "Not charging", "Full" }; + static char *charge_type[] = { + "Unknown", "N/A", "Trickle", "Fast" + }; static char *health_text[] = { "Unknown", "Good", "Overheat", "Dead", "Over voltage", "Unspecified failure", "Cold", @@ -70,6 +73,8 @@ static ssize_t power_supply_show_property(struct device *dev, if (off == POWER_SUPPLY_PROP_STATUS) return sprintf(buf, "%s\n", status_text[value.intval]); + else if (off == POWER_SUPPLY_PROP_CHARGE_TYPE) + return sprintf(buf, "%s\n", charge_type[value.intval]); else if (off == POWER_SUPPLY_PROP_HEALTH) return sprintf(buf, "%s\n", health_text[value.intval]); else if (off == POWER_SUPPLY_PROP_TECHNOLOGY) @@ -86,6 +91,7 @@ static ssize_t power_supply_show_property(struct device *dev, static struct device_attribute power_supply_attrs[] = { /* Properties of type `int' */ POWER_SUPPLY_ATTR(status), + POWER_SUPPLY_ATTR(charge_type), POWER_SUPPLY_ATTR(health), POWER_SUPPLY_ATTR(present), POWER_SUPPLY_ATTR(online), diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 0ab6aa171241..4c7c6fc35487 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -38,6 +38,13 @@ enum { POWER_SUPPLY_STATUS_FULL, }; +enum { + POWER_SUPPLY_CHARGE_TYPE_UNKNOWN = 0, + POWER_SUPPLY_CHARGE_TYPE_NONE, + POWER_SUPPLY_CHARGE_TYPE_TRICKLE, + POWER_SUPPLY_CHARGE_TYPE_FAST, +}; + enum { POWER_SUPPLY_HEALTH_UNKNOWN = 0, POWER_SUPPLY_HEALTH_GOOD, @@ -70,6 +77,7 @@ enum { enum power_supply_property { /* Properties of type `int' */ POWER_SUPPLY_PROP_STATUS = 0, + POWER_SUPPLY_PROP_CHARGE_TYPE, POWER_SUPPLY_PROP_HEALTH, POWER_SUPPLY_PROP_PRESENT, POWER_SUPPLY_PROP_ONLINE, -- cgit v1.2.3-71-gd317 From 788e5abc5441e9046dd91c995c6f1f75bbd144bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:58 +0900 Subject: percpu: drop @unit_size from embed first chunk allocator The only extra feature @unit_size provides is making dead space at the end of the first chunk which doesn't have any valid usecase. Drop the parameter. This will increase consistency with generalized 4k allocator. James Bottomley spotted missing conversion for the default setup_per_cpu_areas() which caused build breakage on all arcsh which use it. [ Impact: drop unused code path ] Signed-off-by: Tejun Heo Cc: James Bottomley Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 2 +- include/linux/percpu.h | 2 +- mm/percpu.c | 18 ++++++------------ 3 files changed, 8 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 29a3eef7cf4a..14728206fb52 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -342,7 +342,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) return -EINVAL; return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); + reserve - PERCPU_FIRST_CHUNK_RESERVE); } /* diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e5000343dd61..83bff053bd1c 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -69,7 +69,7 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size); + ssize_t dyn_size); /* * Use this to get to a cpu's version of the per-cpu object diff --git a/mm/percpu.c b/mm/percpu.c index 19dd83b5cbdc..fc6babe6e554 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1207,7 +1207,6 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. @@ -1219,9 +1218,9 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * page size. * * When @dyn_size is positive, dynamic area might be larger than - * specified to fill page alignment. Also, when @dyn_size is auto, - * @dyn_size does not fill the whole first chunk but only what's - * necessary for page alignment after static and reserved areas. + * specified to fill page alignment. When @dyn_size is auto, + * @dyn_size is just big enough to fill page alignment after static + * and reserved areas. * * If the needed size is smaller than the minimum or specified unit * size, the leftover is returned to the bootmem allocator. @@ -1231,7 +1230,7 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * percpu access on success, -errno on failure. */ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size) + ssize_t dyn_size) { size_t chunk_size; unsigned int cpu; @@ -1242,12 +1241,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, if (dyn_size != 0) dyn_size = pcpue_size - static_size - reserved_size; - if (unit_size >= 0) { - BUG_ON(unit_size < pcpue_size); - pcpue_unit_size = unit_size; - } else - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - + pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); chunk_size = pcpue_unit_size * num_possible_cpus(); pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, @@ -1304,7 +1298,7 @@ void __init setup_per_cpu_areas(void) * what the legacy allocator did. */ unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE, -1); + PERCPU_DYNAMIC_RESERVE); if (unit_size < 0) panic("Failed to initialized percpu areas."); -- cgit v1.2.3-71-gd317 From d4b95f80399471e4bce5e992700ff7f06ef91f6a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: x86,percpu: generalize 4k first chunk allocator Generalize and move x86 setup_pcpu_4k() into pcpu_4k_first_chunk(). setup_pcpu_4k() now is a simple wrapper around the generalized version. Other than taking size parameters and using arch supplied callbacks to allocate/free memory, pcpu_4k_first_chunk() is identical to the original implementation. This simplifies arch code and will help converting more archs to dynamic percpu allocator. While at it, s/pcpu_populate_pte_fn_t/pcpu_fc_populate_pte_fn_t/ for consistency. [ Impact: code reorganization and generalization ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 78 ++++++++++---------------------------- include/linux/percpu.h | 12 +++++- mm/percpu.c | 85 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 113 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 14728206fb52..ab896b31e80b 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -123,6 +123,19 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } +/* + * Helpers for first chunk memory allocation + */ +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size) +{ + return pcpu_alloc_bootmem(cpu, size, size); +} + +static void __init pcpu_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + /* * Large page remap allocator * @@ -346,22 +359,11 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) } /* - * 4k page allocator + * 4k allocator * - * This is the basic allocator. Static percpu area is allocated - * page-by-page and most of initialization is done by the generic - * setup function. + * Boring fallback 4k allocator. This allocator puts more pressure on + * PTE TLBs but other than that behaves nicely on both UMA and NUMA. */ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_nr_static_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) -{ - if (pageno < pcpu4k_nr_static_pages) - return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; - return NULL; -} - static void __init pcpu4k_populate_pte(unsigned long addr) { populate_extra_pte(addr); @@ -369,51 +371,9 @@ static void __init pcpu4k_populate_pte(unsigned long addr) static ssize_t __init setup_pcpu_4k(size_t static_size) { - size_t pages_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - pcpu4k_nr_static_pages = PFN_UP(static_size); - - /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() - * sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); - - /* allocate and copy */ - j = 0; - for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_nr_static_pages; i++) { - void *ptr; - - ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) { - pr_warning("PERCPU: failed to allocate " - "4k page for cpu%u\n", cpu); - goto enomem; - } - - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pcpu4k_pages[j++] = virt_to_page(ptr); - } - - /* we're ready, commit */ - pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", - pcpu4k_nr_static_pages, static_size); - - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, -1, - -1, NULL, pcpu4k_populate_pte); - goto out_free_ar; - -enomem: - while (--j >= 0) - free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); - return ret; + return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpu4k_populate_pte); } /* for explicit first chunk allocator selection */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 83bff053bd1c..41b5bfab4195 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -59,18 +59,26 @@ extern void *pcpu_base_addr; typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); -typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); +typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); +typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); +typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, ssize_t dyn_size, ssize_t unit_size, void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn); + pcpu_fc_populate_pte_fn_t populate_pte_fn); extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); +extern ssize_t __init pcpu_4k_first_chunk( + size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn); + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index fc6babe6e554..27b0f40a3ea8 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1037,7 +1037,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, ssize_t dyn_size, ssize_t unit_size, void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn) + pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; static int smap[2], dmap[2]; @@ -1270,6 +1270,89 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, pcpue_unit_size, pcpue_ptr, NULL); } +/* + * 4k page first chunk setup helper. + */ +static struct page **pcpu4k_pages __initdata; +static int pcpu4k_nr_static_pages __initdata; + +static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) +{ + if (pageno < pcpu4k_nr_static_pages) + return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; + return NULL; +} + +/** + * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE + * @free_fn: funtion to free percpu page, always called with PAGE_SIZE + * @populate_pte_fn: function to populate pte + * + * This is a helper to ease setting up embedded first percpu chunk and + * can be called where pcpu_setup_first_chunk() is expected. + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page into vmalloc area. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) +{ + size_t pages_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + pcpu4k_nr_static_pages = PFN_UP(static_size); + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() * + sizeof(pcpu4k_pages[0])); + pcpu4k_pages = alloc_bootmem(pages_size); + + /* allocate and copy */ + j = 0; + for_each_possible_cpu(cpu) + for (i = 0; i < pcpu4k_nr_static_pages; i++) { + void *ptr; + + ptr = alloc_fn(cpu, PAGE_SIZE); + if (!ptr) { + pr_warning("PERCPU: failed to allocate " + "4k page for cpu%u\n", cpu); + goto enomem; + } + + memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); + pcpu4k_pages[j++] = virt_to_page(ptr); + } + + /* we're ready, commit */ + pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", + pcpu4k_nr_static_pages, static_size); + + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, + reserved_size, -1, + -1, NULL, populate_pte_fn); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpu4k_pages), pages_size); + return ret; +} + /* * Generic percpu area setup. * -- cgit v1.2.3-71-gd317 From 8c4bfc6e8801616ab2e01c38140b2159b388d2ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: x86,percpu: generalize lpage first chunk allocator Generalize and move x86 setup_pcpu_lpage() into pcpu_lpage_first_chunk(). setup_pcpu_lpage() now is a simple wrapper around the generalized version. Other than taking size parameters and using arch supplied callbacks to allocate/free/map memory, pcpu_lpage_first_chunk() is identical to the original implementation. This simplifies arch code and will help converting more archs to dynamic percpu allocator. While at it, factor out pcpu_calc_fc_sizes() which is common to pcpu_embed_first_chunk() and pcpu_lpage_first_chunk(). [ Impact: code reorganization and generalization ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/include/asm/percpu.h | 9 -- arch/x86/kernel/setup_percpu.c | 169 +++------------------------------ arch/x86/mm/pageattr.c | 1 + include/linux/percpu.h | 27 ++++++ mm/percpu.c | 209 ++++++++++++++++++++++++++++++++++++++++- 5 files changed, 244 insertions(+), 171 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 103f1ddb0d85..a18c038a3079 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -156,15 +156,6 @@ do { \ /* We can use this directly for local CPU (faster). */ DECLARE_PER_CPU(unsigned long, this_cpu_off); -#ifdef CONFIG_NEED_MULTIPLE_NODES -void *pcpu_lpage_remapped(void *kaddr); -#else -static inline void *pcpu_lpage_remapped(void *kaddr) -{ - return NULL; -} -#endif - #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ab896b31e80b..4f2e0ac9130b 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -137,44 +137,21 @@ static void __init pcpu_fc_free(void *ptr, size_t size) } /* - * Large page remap allocator - * - * This allocator uses PMD page as unit. A PMD page is allocated for - * each cpu and each is remapped into vmalloc area using PMD mapping. - * As PMD page is quite large, only part of it is used for the first - * chunk. Unused part is returned to the bootmem allocator. - * - * So, the PMD pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more PMD TLB entry pressure but still is much - * better than only using 4k mappings while still being NUMA friendly. + * Large page remapping allocator */ #ifdef CONFIG_NEED_MULTIPLE_NODES -struct pcpul_ent { - unsigned int cpu; - void *ptr; -}; - -static size_t pcpul_size; -static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; - -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +static void __init pcpul_map(void *ptr, size_t size, void *addr) { - size_t off = (size_t)pageno << PAGE_SHIFT; + pmd_t *pmd, pmd_v; - if (off >= pcpul_size) - return NULL; - - return virt_to_page(pcpul_map[cpu].ptr + off); + pmd = populate_extra_pmd((unsigned long)addr); + pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE); + set_pmd(pmd, pmd_v); } static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { - size_t map_size, dyn_size; - unsigned int cpu; - int i, j; - ssize_t ret; + size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; @@ -198,134 +175,10 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) return -EINVAL; } - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. - */ - pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - if (pcpul_size > PMD_SIZE) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } - dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); - pcpul_map = alloc_bootmem(map_size); - - for_each_possible_cpu(cpu) { - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, - PMD_SIZE); - if (!pcpul_map[cpu].ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The 2MB up-rounding bootmem is needed to make - * sure the partial 2MB page is still fully RAM - it's - * not well-specified to have a PAT-incompatible area - * (unmapped RAM, device memory, etc.) in that hole. - */ - free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), - PMD_SIZE - pcpul_size); - - memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); - } - - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&pcpul_vm, PMD_SIZE); - - for_each_possible_cpu(cpu) { - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + - cpu * PMD_SIZE); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), - PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); - } - - /* we're ready, commit */ - pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); - - ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < num_possible_cpus() - 1; i++) - for (j = i + 1; j < num_possible_cpus(); j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - return ret; - -enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; -} - -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu PMD - * mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used PMD - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. - */ -void *pcpu_lpage_remapped(void *kaddr) -{ - void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); - unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; - int left = 0, right = num_possible_cpus() - 1; - int pos; - - /* pcpul in use at all? */ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < pmd_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > pmd_addr) - right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * PMD_SIZE + offset; - } - } - - return NULL; + return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + reserve - PERCPU_FIRST_CHUNK_RESERVE, + PMD_SIZE, + pcpu_fc_alloc, pcpu_fc_free, pcpul_map); } #else static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1b734d7a8966..c106f7852424 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 41b5bfab4195..9f6bfd7d4b92 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -62,6 +62,7 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); +typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, @@ -79,6 +80,32 @@ extern ssize_t __init pcpu_4k_first_chunk( pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); +#ifdef CONFIG_NEED_MULTIPLE_NODES +extern ssize_t __init pcpu_lpage_first_chunk( + size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn); + +extern void *pcpu_lpage_remapped(void *kaddr); +#else +static inline ssize_t __init pcpu_lpage_first_chunk( + size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) +{ + return -EINVAL; +} + +static inline void *pcpu_lpage_remapped(void *kaddr) +{ + return NULL; +} +#endif + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index f3fe7bc7378f..17db527ee2e2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1190,6 +1190,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, return pcpu_unit_size; } +static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep) +{ + size_t size_sum; + + size_sum = PFN_ALIGN(static_size + reserved_size + + (*dyn_sizep >= 0 ? *dyn_sizep : 0)); + if (*dyn_sizep != 0) + *dyn_sizep = size_sum - static_size - reserved_size; + + return size_sum; +} + /* * Embedding first chunk setup helper. */ @@ -1241,10 +1254,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, unsigned int cpu; /* determine parameters and allocate */ - pcpue_size = PFN_ALIGN(static_size + reserved_size + - (dyn_size >= 0 ? dyn_size : 0)); - if (dyn_size != 0) - dyn_size = pcpue_size - static_size - reserved_size; + pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); chunk_size = pcpue_unit_size * num_possible_cpus(); @@ -1390,6 +1400,197 @@ out_free_ar: return ret; } +/* + * Large page remapping first chunk setup helper + */ +#ifdef CONFIG_NEED_MULTIPLE_NODES +struct pcpul_ent { + unsigned int cpu; + void *ptr; +}; + +static size_t pcpul_size; +static size_t pcpul_unit_size; +static struct pcpul_ent *pcpul_map; +static struct vm_struct pcpul_vm; + +static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpul_size) + return NULL; + + return virt_to_page(pcpul_map[cpu].ptr + off); +} + +/** + * pcpu_lpage_first_chunk - remap the first percpu chunk using large page + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @lpage_size: the size of a large page + * @alloc_fn: function to allocate percpu lpage, always called with lpage_size + * @free_fn: function to free percpu memory, @size <= lpage_size + * @map_fn: function to map percpu lpage, always called with lpage_size + * + * This allocator uses large page as unit. A large page is allocated + * for each cpu and each is remapped into vmalloc area using large + * page mapping. As large page can be quite large, only part of it is + * used for the first chunk. Unused part is returned to the bootmem + * allocator. + * + * So, the large pages are mapped twice - once to the physical mapping + * and to the vmalloc area for the first percpu chunk. The double + * mapping does add one more large TLB entry pressure but still is + * much better than only using 4k mappings while still being NUMA + * friendly. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) +{ + size_t size_sum; + size_t map_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + /* + * Currently supports only single page. Supporting multiple + * pages won't be too difficult if it ever becomes necessary. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + + pcpul_unit_size = lpage_size; + pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + if (pcpul_size > pcpul_unit_size) { + pr_warning("PERCPU: static data is larger than large page, " + "can't use large page\n"); + return -EINVAL; + } + + /* allocate pointer array and alloc large pages */ + map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + pcpul_map = alloc_bootmem(map_size); + + for_each_possible_cpu(cpu) { + void *ptr; + + ptr = alloc_fn(cpu, lpage_size); + if (!ptr) { + pr_warning("PERCPU: failed to allocate large page " + "for cpu%u\n", cpu); + goto enomem; + } + + /* + * Only use pcpul_size bytes and give back the rest. + * + * Ingo: The lpage_size up-rounding bootmem is needed + * to make sure the partial lpage is still fully RAM - + * it's not well-specified to have a incompatible area + * (unmapped RAM, device memory, etc.) in that hole. + */ + free_fn(ptr + pcpul_size, lpage_size - pcpul_size); + + pcpul_map[cpu].cpu = cpu; + pcpul_map[cpu].ptr = ptr; + + memcpy(ptr, __per_cpu_load, static_size); + } + + /* allocate address and map */ + pcpul_vm.flags = VM_ALLOC; + pcpul_vm.size = num_possible_cpus() * pcpul_unit_size; + vm_area_register_early(&pcpul_vm, pcpul_unit_size); + + for_each_possible_cpu(cpu) + map_fn(pcpul_map[cpu].ptr, pcpul_unit_size, + pcpul_vm.addr + cpu * pcpul_unit_size); + + /* we're ready, commit */ + pr_info("PERCPU: Remapped at %p with large pages, static data " + "%zu bytes\n", pcpul_vm.addr, static_size); + + ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, + reserved_size, dyn_size, pcpul_unit_size, + pcpul_vm.addr, NULL); + + /* sort pcpul_map array for pcpu_lpage_remapped() */ + for (i = 0; i < num_possible_cpus() - 1; i++) + for (j = i + 1; j < num_possible_cpus(); j++) + if (pcpul_map[i].ptr > pcpul_map[j].ptr) { + struct pcpul_ent tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + return ret; + +enomem: + for_each_possible_cpu(cpu) + if (pcpul_map[cpu].ptr) + free_fn(pcpul_map[cpu].ptr, pcpul_size); + free_bootmem(__pa(pcpul_map), map_size); + return -ENOMEM; +} + +/** + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area + * @kaddr: the kernel address in question + * + * Determine whether @kaddr falls in the pcpul recycled area. This is + * used by pageattr to detect VM aliases and break up the pcpu large + * page mapping such that the same physical page is not mapped under + * different attributes. + * + * The recycled area is always at the tail of a partially used large + * page. + * + * RETURNS: + * Address of corresponding remapped pcpu address if match is found; + * otherwise, NULL. + */ +void *pcpu_lpage_remapped(void *kaddr) +{ + unsigned long unit_mask = pcpul_unit_size - 1; + void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask); + unsigned long offset = (unsigned long)kaddr & unit_mask; + int left = 0, right = num_possible_cpus() - 1; + int pos; + + /* pcpul in use at all? */ + if (!pcpul_map) + return NULL; + + /* okay, perform binary search */ + while (left <= right) { + pos = (left + right) / 2; + + if (pcpul_map[pos].ptr < lpage_addr) + left = pos + 1; + else if (pcpul_map[pos].ptr > lpage_addr) + right = pos - 1; + else { + /* it shouldn't be in the area for the first chunk */ + WARN_ON(offset < pcpul_size); + + return pcpul_vm.addr + + pcpul_map[pos].cpu * pcpul_unit_size + offset; + } + } + + return NULL; +} +#endif + /* * Generic percpu area setup. * -- cgit v1.2.3-71-gd317 From 38a6be525460f52ac6f2de1c3f73c5615a8853cd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: percpu: simplify pcpu_setup_first_chunk() Now that all first chunk allocator helpers allocate and map the first chunk themselves, there's no need to have optional default alloc/map in pcpu_setup_first_chunk(). Drop @populate_pte_fn and only leave @dyn_size optional and make all other params mandatory. This makes it much easier to follow what pcpu_setup_first_chunk() is doing and what actual differences tweaking each parameter results in. [ Impact: drop unused code path ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/sparc/kernel/smp_64.c | 2 +- include/linux/percpu.h | 5 +-- mm/percpu.c | 104 +++++++++++++-------------------------------- 3 files changed, 33 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index fa44eaf8d897..ccad7b20ae75 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1528,7 +1528,7 @@ void __init setup_per_cpu_areas(void) pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size, PERCPU_MODULE_RESERVE, dyn_size, - PCPU_CHUNK_SIZE, vm.addr, NULL); + PCPU_CHUNK_SIZE, vm.addr); free_bootmem(__pa(pcpur_ptrs), ptrs_size); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 9f6bfd7d4b92..ec64357e1762 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -66,9 +66,8 @@ typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size, - void *base_addr, - pcpu_fc_populate_pte_fn_t populate_pte_fn); + ssize_t dyn_size, size_t unit_size, + void *base_addr); extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, diff --git a/mm/percpu.c b/mm/percpu.c index 17db527ee2e2..21d938a10662 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -983,24 +983,22 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_first_chunk - initialize the first percpu chunk * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes - * @reserved_size: the size of reserved percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes, 0 for none * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto - * @base_addr: mapped address, NULL for auto - * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE + * @base_addr: mapped address * * Initialize the first percpu chunk which contains the kernel static * perpcu area. This function is to be called from arch percpu area - * setup path. The first two parameters are mandatory. The rest are - * optional. + * setup path. * * @get_page_fn() should return pointer to percpu page given cpu * number and page number. It should at least return enough pages to * cover the static area. The returned pages for static area should - * have been initialized with valid data. If @unit_size is specified, - * it can also return pages after the static area. NULL return - * indicates end of pages for the cpu. Note that @get_page_fn() must - * return the same number of pages for all cpus. + * have been initialized with valid data. It can also return pages + * after the static area. NULL return indicates end of pages for the + * cpu. Note that @get_page_fn() must return the same number of pages + * for all cpus. * * @reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves @@ -1015,17 +1013,12 @@ EXPORT_SYMBOL_GPL(free_percpu); * non-negative value makes percpu leave alone the area beyond * @static_size + @reserved_size + @dyn_size. * - * @unit_size, if non-negative, specifies unit size and must be - * aligned to PAGE_SIZE and equal to or larger than @static_size + - * @reserved_size + if non-negative, @dyn_size. - * - * Non-null @base_addr means that the caller already allocated virtual - * region for the first chunk and mapped it. percpu must not mess - * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL - * @populate_pte_fn doesn't make any sense. + * @unit_size specifies unit size and must be aligned to PAGE_SIZE and + * equal to or larger than @static_size + @reserved_size + if + * non-negative, @dyn_size. * - * @populate_pte_fn is used to populate the pagetable. NULL means the - * caller already populated the pagetable. + * The caller should have mapped the first chunk at @base_addr and + * copied static data to each unit. * * If the first chunk ends up with both reserved and dynamic areas, it * is served by two chunks - one to serve the core static and reserved @@ -1040,9 +1033,8 @@ EXPORT_SYMBOL_GPL(free_percpu); */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size, - void *base_addr, - pcpu_fc_populate_pte_fn_t populate_pte_fn) + ssize_t dyn_size, size_t unit_size, + void *base_addr) { static struct vm_struct first_vm; static int smap[2], dmap[2]; @@ -1050,27 +1042,18 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, (dyn_size >= 0 ? dyn_size : 0); struct pcpu_chunk *schunk, *dchunk = NULL; unsigned int cpu; - int nr_pages; - int err, i; + int i, nr_pages; /* santiy checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); - if (unit_size >= 0) { - BUG_ON(unit_size < size_sum); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); - } else - BUG_ON(base_addr); - BUG_ON(base_addr && populate_pte_fn); - - if (unit_size >= 0) - pcpu_unit_pages = unit_size >> PAGE_SHIFT; - else - pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(size_sum)); + BUG_ON(!base_addr); + BUG_ON(unit_size < size_sum); + BUG_ON(unit_size & ~PAGE_MASK); + BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + pcpu_unit_pages = unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) @@ -1079,6 +1062,10 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (dyn_size < 0) dyn_size = pcpu_unit_size - static_size - reserved_size; + first_vm.flags = VM_ALLOC; + first_vm.size = pcpu_chunk_size; + first_vm.addr = base_addr; + /* * Allocate chunk slots. The additional last slot is for * empty chunks. @@ -1101,6 +1088,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); schunk->page = schunk->page_ar; + schunk->immutable = true; if (reserved_size) { schunk->free_size = reserved_size; @@ -1124,31 +1112,13 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); dchunk->page = schunk->page_ar; /* share page map with schunk */ + dchunk->immutable = true; dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; dchunk->map[dchunk->map_used++] = dchunk->free_size; } - /* allocate vm address */ - first_vm.flags = VM_ALLOC; - first_vm.size = pcpu_chunk_size; - - if (!base_addr) - vm_area_register_early(&first_vm, PAGE_SIZE); - else { - /* - * Pages already mapped. No need to remap into - * vmalloc area. In this case the first chunks can't - * be mapped or unmapped by percpu and are marked - * immutable. - */ - first_vm.addr = base_addr; - schunk->immutable = true; - if (dchunk) - dchunk->immutable = true; - } - /* assign pages */ nr_pages = -1; for_each_possible_cpu(cpu) { @@ -1168,19 +1138,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, BUG_ON(nr_pages != i); } - /* map them */ - if (populate_pte_fn) { - for_each_possible_cpu(cpu) - for (i = 0; i < nr_pages; i++) - populate_pte_fn(pcpu_chunk_addr(schunk, - cpu, i)); - - err = pcpu_map(schunk, 0, nr_pages); - if (err) - panic("failed to setup static percpu area, err=%d\n", - err); - } - /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; pcpu_chunk_relocate(pcpu_first_chunk, -1); @@ -1282,7 +1239,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, return pcpu_setup_first_chunk(pcpue_get_page, static_size, reserved_size, dyn_size, - pcpue_unit_size, pcpue_ptr, NULL); + pcpue_unit_size, pcpue_ptr); } /* @@ -1387,8 +1344,7 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, reserved_size, -1, - pcpu4k_unit_pages << PAGE_SHIFT, vm.addr, - NULL); + pcpu4k_unit_pages << PAGE_SHIFT, vm.addr); goto out_free_ar; enomem: @@ -1521,7 +1477,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, reserved_size, dyn_size, pcpul_unit_size, - pcpul_vm.addr, NULL); + pcpul_vm.addr); /* sort pcpul_map array for pcpu_lpage_remapped() */ for (i = 0; i < num_possible_cpus() - 1; i++) -- cgit v1.2.3-71-gd317 From ce3141a277ff6cc37e51008b8888dc2cb7456ef1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:11:00 +0900 Subject: percpu: drop pcpu_chunk->page[] percpu core doesn't need to tack all the allocated pages. It needs to know whether certain pages are populated and a way to reverse map address to page when freeing. This patch drops pcpu_chunk->page[] and use populated bitmap and vmalloc_to_page() lookup instead. Using vmalloc_to_page() exclusively is also possible but complicates first chunk handling, inflates cache footprint and prevents non-standard memory allocation for percpu memory. pcpu_chunk->page[] was used to track each page's allocation and allowed asymmetric population which happens during failure path; however, with single bitmap for all units, this is no longer possible. Bite the bullet and rewrite (de)populate functions so that things are done in clearly separated steps such that asymmetric population doesn't happen. This makes the (de)population process much more modular and will also ease implementing non-standard memory usage in the future (e.g. large pages). This makes @get_page_fn parameter to pcpu_setup_first_chunk() unnecessary. The parameter is dropped and all first chunk helpers are updated accordingly. Please note that despite the volume most changes to first chunk helpers are symbol renames for variables which don't need to be referenced outside of the helper anymore. This change reduces memory usage and cache footprint of pcpu_chunk. Now only #unit_pages bits are necessary per chunk. [ Impact: reduced memory usage and cache footprint for bookkeeping ] Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: David Miller --- arch/sparc/kernel/smp_64.c | 42 ++-- include/linux/percpu.h | 3 +- mm/percpu.c | 604 ++++++++++++++++++++++++++++----------------- 3 files changed, 400 insertions(+), 249 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index ccad7b20ae75..f2f22ee97a7a 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1415,19 +1415,6 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } -static size_t pcpur_size __initdata; -static void **pcpur_ptrs __initdata; - -static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpur_size) - return NULL; - - return virt_to_page(pcpur_ptrs[cpu] + off); -} - #define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL) static void __init pcpu_map_range(unsigned long start, unsigned long end, @@ -1491,25 +1478,26 @@ void __init setup_per_cpu_areas(void) size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start; static struct vm_struct vm; unsigned long delta, cpu; - size_t pcpu_unit_size; + size_t size_sum, pcpu_unit_size; size_t ptrs_size; + void **ptrs; - pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE; + size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE); + dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE; - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); - pcpur_ptrs = alloc_bootmem(ptrs_size); + ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(ptrs[0])); + ptrs = alloc_bootmem(ptrs_size); for_each_possible_cpu(cpu) { - pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE, - PCPU_CHUNK_SIZE); + ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE, + PCPU_CHUNK_SIZE); - free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), - PCPU_CHUNK_SIZE - pcpur_size); + free_bootmem(__pa(ptrs[cpu] + size_sum), + PCPU_CHUNK_SIZE - size_sum); - memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + memcpy(ptrs[cpu], __per_cpu_load, static_size); } /* allocate address and map */ @@ -1523,14 +1511,14 @@ void __init setup_per_cpu_areas(void) start += cpu * PCPU_CHUNK_SIZE; end = start + PCPU_CHUNK_SIZE; - pcpu_map_range(start, end, virt_to_page(pcpur_ptrs[cpu])); + pcpu_map_range(start, end, virt_to_page(ptrs[cpu])); } - pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size, + pcpu_unit_size = pcpu_setup_first_chunk(static_size, PERCPU_MODULE_RESERVE, dyn_size, PCPU_CHUNK_SIZE, vm.addr); - free_bootmem(__pa(pcpur_ptrs), ptrs_size); + free_bootmem(__pa(ptrs), ptrs_size); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { diff --git a/include/linux/percpu.h b/include/linux/percpu.h index ec64357e1762..63c8b7a23e66 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -58,13 +58,12 @@ extern void *pcpu_base_addr; -typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); -extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, +extern size_t __init pcpu_setup_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size, size_t unit_size, void *base_addr); diff --git a/mm/percpu.c b/mm/percpu.c index 639fce4d2caf..21756814d99f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -94,8 +94,7 @@ struct pcpu_chunk { int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ bool immutable; /* no [de]population allowed */ - struct page **page; /* points to page array */ - struct page *page_ar[]; /* #cpus * UNIT_PAGES */ + unsigned long populated[]; /* populated bitmap */ }; static int pcpu_unit_pages __read_mostly; @@ -129,9 +128,9 @@ static int pcpu_reserved_chunk_limit; * Synchronization rules. * * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former - * protects allocation/reclaim paths, chunks and chunk->page arrays. - * The latter is a spinlock and protects the index data structures - - * chunk slots, chunks and area maps in chunks. + * protects allocation/reclaim paths, chunks, populated bitmap and + * vmalloc mapping. The latter is a spinlock and protects the index + * data structures - chunk slots, chunks and area maps in chunks. * * During allocation, pcpu_alloc_mutex is kept locked all the time and * pcpu_lock is grabbed and released as necessary. All actual memory @@ -188,16 +187,13 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); } -static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, - unsigned int cpu, int page_idx) +static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) { - return &chunk->page[pcpu_page_idx(cpu, page_idx)]; -} + /* must not be used on pre-mapped chunk */ + WARN_ON(chunk->immutable); -static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, - int page_idx) -{ - return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; + return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); } /* set the pointer to a chunk in a page struct */ @@ -212,6 +208,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) return (struct pcpu_chunk *)page->index; } +static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end) +{ + *rs = find_next_zero_bit(chunk->populated, end, *rs); + *re = find_next_bit(chunk->populated, end, *rs + 1); +} + +static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end) +{ + *rs = find_next_bit(chunk->populated, end, *rs); + *re = find_next_zero_bit(chunk->populated, end, *rs + 1); +} + +/* + * (Un)populated page region iterators. Iterate over (un)populated + * page regions betwen @start and @end in @chunk. @rs and @re should + * be integer variables and will be set to start and end page index of + * the current region. + */ +#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end))) + +#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) + /** * pcpu_mem_alloc - allocate memory * @size: bytes to allocate @@ -545,42 +569,197 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) } /** - * pcpu_unmap - unmap pages out of a pcpu_chunk + * pcpu_get_pages_and_bitmap - get temp pages array and bitmap + * @chunk: chunk of interest + * @bitmapp: output parameter for bitmap + * @may_alloc: may allocate the array + * + * Returns pointer to array of pointers to struct page and bitmap, + * both of which can be indexed with pcpu_page_idx(). The returned + * array is cleared to zero and *@bitmapp is copied from + * @chunk->populated. Note that there is only one array and bitmap + * and access exclusion is the caller's responsibility. + * + * CONTEXT: + * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. + * Otherwise, don't care. + * + * RETURNS: + * Pointer to temp pages array on success, NULL on failure. + */ +static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, + unsigned long **bitmapp, + bool may_alloc) +{ + static struct page **pages; + static unsigned long *bitmap; + size_t pages_size = num_possible_cpus() * pcpu_unit_pages * + sizeof(pages[0]); + size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * + sizeof(unsigned long); + + if (!pages || !bitmap) { + if (may_alloc && !pages) + pages = pcpu_mem_alloc(pages_size); + if (may_alloc && !bitmap) + bitmap = pcpu_mem_alloc(bitmap_size); + if (!pages || !bitmap) + return NULL; + } + + memset(pages, 0, pages_size); + bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); + + *bitmapp = bitmap; + return pages; +} + +/** + * pcpu_free_pages - free pages which were allocated for @chunk + * @chunk: chunk pages were allocated for + * @pages: array of pages to be freed, indexed by pcpu_page_idx() + * @populated: populated bitmap + * @page_start: page index of the first page to be freed + * @page_end: page index of the last page to be freed + 1 + * + * Free pages [@page_start and @page_end) in @pages for all units. + * The pages were allocated for @chunk. + */ +static void pcpu_free_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page = pages[pcpu_page_idx(cpu, i)]; + + if (page) + __free_page(page); + } + } +} + +/** + * pcpu_alloc_pages - allocates pages for @chunk + * @chunk: target chunk + * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() + * @populated: populated bitmap + * @page_start: page index of the first page to be allocated + * @page_end: page index of the last page to be allocated + 1 + * + * Allocate pages [@page_start,@page_end) into @pages for all units. + * The allocation is for @chunk. Percpu core doesn't care about the + * content of @pages and will pass it verbatim to pcpu_map_pages(). + */ +static int pcpu_alloc_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; + + *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); + if (!*pagep) { + pcpu_free_pages(chunk, pages, populated, + page_start, page_end); + return -ENOMEM; + } + } + } + return 0; +} + +/** + * pcpu_pre_unmap_flush - flush cache prior to unmapping + * @chunk: chunk the regions to be flushed belongs to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages in [@page_start,@page_end) of @chunk are about to be + * unmapped. Flush cache. As each flushing trial can be very + * expensive, issue flush on the whole region at once rather than + * doing it for each cpu. This could be an overkill but is more + * scalable. + */ +static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + unsigned int last = num_possible_cpus() - 1; + + flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); +} + +static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) +{ + unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); +} + +/** + * pcpu_unmap_pages - unmap pages out of a pcpu_chunk * @chunk: chunk of interest + * @pages: pages array which can be used to pass information to free + * @populated: populated bitmap * @page_start: page index of the first page to unmap * @page_end: page index of the last page to unmap + 1 - * @flush_tlb: whether to flush tlb or not * * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. - * If @flush is true, vcache is flushed before unmapping and tlb - * after. + * Corresponding elements in @pages were cleared by the caller and can + * be used to carry information to pcpu_free_pages() which will be + * called after all unmaps are finished. The caller should call + * proper pre/post flush functions. */ -static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, - bool flush_tlb) +static void pcpu_unmap_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; unsigned int cpu; + int i; - /* unmap must not be done on immutable chunk */ - WARN_ON(chunk->immutable); + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page; - /* - * Each flushing trial can be very expensive, issue flush on - * the whole region at once rather than doing it for each cpu. - * This could be an overkill but is more scalable. - */ - flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + page = pcpu_chunk_page(chunk, cpu, i); + WARN_ON(!page); + pages[pcpu_page_idx(cpu, i)] = page; + } + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), + page_end - page_start); + } - for_each_possible_cpu(cpu) - unmap_kernel_range_noflush( - pcpu_chunk_addr(chunk, cpu, page_start), - (page_end - page_start) << PAGE_SHIFT); - - /* ditto as flush_cache_vunmap() */ - if (flush_tlb) - flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + for (i = page_start; i < page_end; i++) + __clear_bit(i, populated); +} + +/** + * pcpu_post_unmap_tlb_flush - flush TLB after unmapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush + * TLB for the regions. This can be skipped if the area is to be + * returned to vmalloc as vmalloc will handle TLB flushing lazily. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. + */ +static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + unsigned int last = num_possible_cpus() - 1; + + flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, @@ -591,35 +770,76 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, } /** - * pcpu_map - map pages into a pcpu_chunk + * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest + * @pages: pages array containing pages to be mapped + * @populated: populated bitmap * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * - * For each cpu, map pages [@page_start,@page_end) into @chunk. - * vcache is flushed afterwards. + * For each cpu, map pages [@page_start,@page_end) into @chunk. The + * caller is responsible for calling pcpu_post_map_flush() after all + * mappings are complete. + * + * This function is responsible for setting corresponding bits in + * @chunk->populated bitmap and whatever is necessary for reverse + * lookup (addr -> chunk). */ -static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) +static int pcpu_map_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; - unsigned int cpu; - int err; - - /* map must not be done on immutable chunk */ - WARN_ON(chunk->immutable); + unsigned int cpu, tcpu; + int i, err; for_each_possible_cpu(cpu) { err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), - pcpu_chunk_pagep(chunk, cpu, page_start), + &pages[pcpu_page_idx(cpu, page_start)], page_end - page_start); if (err < 0) - return err; + goto err; } + /* mapping successful, link chunk and mark populated */ + for (i = page_start; i < page_end; i++) { + for_each_possible_cpu(cpu) + pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], + chunk); + __set_bit(i, populated); + } + + return 0; + +err: + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), + page_end - page_start); + } + return err; +} + +/** + * pcpu_post_map_flush - flush cache after mapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been mapped. Flush + * cache. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. + */ +static void pcpu_post_map_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + unsigned int last = num_possible_cpus() - 1; + /* flush at once, please read comments in pcpu_unmap() */ flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), pcpu_chunk_addr(chunk, last, page_end)); - return 0; } /** @@ -636,39 +856,45 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) * CONTEXT: * pcpu_alloc_mutex. */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, - bool flush) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) { int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); - int unmap_start = -1; - int uninitialized_var(unmap_end); - unsigned int cpu; - int i; + struct page **pages; + unsigned long *populated; + int rs, re; + + /* quick path, check whether it's empty already */ + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + if (rs == page_start && re == page_end) + return; + break; + } - for (i = page_start; i < page_end; i++) { - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + /* immutable chunks can't be depopulated */ + WARN_ON(chunk->immutable); - if (!*pagep) - continue; + /* + * If control reaches here, there must have been at least one + * successful population attempt so the temp pages array must + * be available now. + */ + pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); + BUG_ON(!pages); - __free_page(*pagep); + /* unmap and free */ + pcpu_pre_unmap_flush(chunk, page_start, page_end); - /* - * If it's partial depopulation, it might get - * populated or depopulated again. Mark the - * page gone. - */ - *pagep = NULL; + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) + pcpu_unmap_pages(chunk, pages, populated, rs, re); - unmap_start = unmap_start < 0 ? i : unmap_start; - unmap_end = i + 1; - } - } + /* no need to flush tlb, vmalloc will handle it lazily */ + + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) + pcpu_free_pages(chunk, pages, populated, rs, re); - if (unmap_start >= 0) - pcpu_unmap(chunk, unmap_start, unmap_end, flush); + /* commit new bitmap */ + bitmap_copy(chunk->populated, populated, pcpu_unit_pages); } /** @@ -685,50 +911,61 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) { - const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); - int map_start = -1; - int uninitialized_var(map_end); + int free_end = page_start, unmap_end = page_start; + struct page **pages; + unsigned long *populated; unsigned int cpu; - int i; + int rs, re, rc; - for (i = page_start; i < page_end; i++) { - if (pcpu_chunk_page_occupied(chunk, i)) { - if (map_start >= 0) { - if (pcpu_map(chunk, map_start, map_end)) - goto err; - map_start = -1; - } - continue; - } + /* quick path, check whether all pages are already there */ + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { + if (rs == page_start && re == page_end) + goto clear; + break; + } - map_start = map_start < 0 ? i : map_start; - map_end = i + 1; + /* need to allocate and map pages, this chunk can't be immutable */ + WARN_ON(chunk->immutable); - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); + if (!pages) + return -ENOMEM; - *pagep = alloc_pages_node(cpu_to_node(cpu), - alloc_mask, 0); - if (!*pagep) - goto err; - pcpu_set_page_chunk(*pagep, chunk); - } + /* alloc and map */ + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); + if (rc) + goto err_free; + free_end = re; } - if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) - goto err; + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + rc = pcpu_map_pages(chunk, pages, populated, rs, re); + if (rc) + goto err_unmap; + unmap_end = re; + } + pcpu_post_map_flush(chunk, page_start, page_end); + /* commit new bitmap */ + bitmap_copy(chunk->populated, populated, pcpu_unit_pages); +clear: for_each_possible_cpu(cpu) memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, size); - return 0; -err: - /* likely under heavy memory pressure, give memory back */ - pcpu_depopulate_chunk(chunk, off, size, true); - return -ENOMEM; + +err_unmap: + pcpu_pre_unmap_flush(chunk, page_start, unmap_end); + pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) + pcpu_unmap_pages(chunk, pages, populated, rs, re); + pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); +err_free: + pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) + pcpu_free_pages(chunk, pages, populated, rs, re); + return rc; } static void free_pcpu_chunk(struct pcpu_chunk *chunk) @@ -752,7 +989,6 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map[chunk->map_used++] = pcpu_unit_size; - chunk->page = chunk->page_ar; chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); if (!chunk->vm) { @@ -933,7 +1169,7 @@ static void pcpu_reclaim(struct work_struct *work) mutex_unlock(&pcpu_alloc_mutex); list_for_each_entry_safe(chunk, next, &todo, list) { - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); free_pcpu_chunk(chunk); } } @@ -981,7 +1217,6 @@ EXPORT_SYMBOL_GPL(free_percpu); /** * pcpu_setup_first_chunk - initialize the first percpu chunk - * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes, 0 for none * @dyn_size: free size for dynamic allocation in bytes, -1 for auto @@ -992,14 +1227,6 @@ EXPORT_SYMBOL_GPL(free_percpu); * perpcu area. This function is to be called from arch percpu area * setup path. * - * @get_page_fn() should return pointer to percpu page given cpu - * number and page number. It should at least return enough pages to - * cover the static area. The returned pages for static area should - * have been initialized with valid data. It can also return pages - * after the static area. NULL return indicates end of pages for the - * cpu. Note that @get_page_fn() must return the same number of pages - * for all cpus. - * * @reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved @@ -1031,8 +1258,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * The determined pcpu_unit_size which can be used to initialize * percpu access. */ -size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t reserved_size, +size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, ssize_t dyn_size, size_t unit_size, void *base_addr) { @@ -1041,8 +1267,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t size_sum = static_size + reserved_size + (dyn_size >= 0 ? dyn_size : 0); struct pcpu_chunk *schunk, *dchunk = NULL; - unsigned int cpu; - int i, nr_pages; + int i; /* santiy checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || @@ -1056,8 +1281,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_unit_pages = unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; - pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); + pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); if (dyn_size < 0) dyn_size = pcpu_unit_size - static_size - reserved_size; @@ -1087,8 +1312,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, schunk->vm = &first_vm; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); - schunk->page = schunk->page_ar; schunk->immutable = true; + bitmap_fill(schunk->populated, pcpu_unit_pages); if (reserved_size) { schunk->free_size = reserved_size; @@ -1106,38 +1331,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, /* init dynamic chunk if necessary */ if (dyn_size) { - dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); + dchunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&dchunk->list); dchunk->vm = &first_vm; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); - dchunk->page = schunk->page_ar; /* share page map with schunk */ dchunk->immutable = true; + bitmap_fill(dchunk->populated, pcpu_unit_pages); dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; dchunk->map[dchunk->map_used++] = dchunk->free_size; } - /* assign pages */ - nr_pages = -1; - for_each_possible_cpu(cpu) { - for (i = 0; i < pcpu_unit_pages; i++) { - struct page *page = get_page_fn(cpu, i); - - if (!page) - break; - *pcpu_chunk_pagep(schunk, cpu, i) = page; - } - - BUG_ON(i < PFN_UP(static_size)); - - if (nr_pages < 0) - nr_pages = i; - else - BUG_ON(nr_pages != i); - } - /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; pcpu_chunk_relocate(pcpu_first_chunk, -1); @@ -1160,23 +1366,6 @@ static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, return size_sum; } -/* - * Embedding first chunk setup helper. - */ -static void *pcpue_ptr __initdata; -static size_t pcpue_size __initdata; -static size_t pcpue_unit_size __initdata; - -static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpue_size) - return NULL; - - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); -} - /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @static_size: the size of static percpu area in bytes @@ -1207,18 +1396,19 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, ssize_t dyn_size) { - size_t chunk_size; + size_t size_sum, unit_size, chunk_size; + void *base; unsigned int cpu; /* determine parameters and allocate */ - pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - chunk_size = pcpue_unit_size * num_possible_cpus(); + unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + chunk_size = unit_size * num_possible_cpus(); - pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, - __pa(MAX_DMA_ADDRESS)); - if (!pcpue_ptr) { + base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, + __pa(MAX_DMA_ADDRESS)); + if (!base) { pr_warning("PERCPU: failed to allocate %zu bytes for " "embedding\n", chunk_size); return -ENOMEM; @@ -1226,33 +1416,18 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, /* return the leftover and copy */ for_each_possible_cpu(cpu) { - void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + void *ptr = base + cpu * unit_size; - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); + free_bootmem(__pa(ptr + size_sum), unit_size - size_sum); memcpy(ptr, __per_cpu_load, static_size); } /* we're ready, commit */ pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); + size_sum >> PAGE_SHIFT, base, static_size); - return pcpu_setup_first_chunk(pcpue_get_page, static_size, - reserved_size, dyn_size, - pcpue_unit_size, pcpue_ptr); -} - -/* - * 4k page first chunk setup helper. - */ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_unit_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) -{ - if (pageno < pcpu4k_unit_pages) - return pcpu4k_pages[cpu * pcpu4k_unit_pages + pageno]; - return NULL; + return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, + unit_size, base); } /** @@ -1279,23 +1454,25 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; + int unit_pages; size_t pages_size; + struct page **pages; unsigned int cpu; int i, j; ssize_t ret; - pcpu4k_unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, - PCPU_MIN_UNIT_SIZE)); + unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, + PCPU_MIN_UNIT_SIZE)); /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_unit_pages * num_possible_cpus() * - sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); + pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * + sizeof(pages[0])); + pages = alloc_bootmem(pages_size); /* allocate pages */ j = 0; for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_unit_pages; i++) { + for (i = 0; i < unit_pages; i++) { void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE); @@ -1304,25 +1481,24 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, "4k page for cpu%u\n", cpu); goto enomem; } - pcpu4k_pages[j++] = virt_to_page(ptr); + pages[j++] = virt_to_page(ptr); } /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; - vm.size = num_possible_cpus() * pcpu4k_unit_pages << PAGE_SHIFT; + vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT; vm_area_register_early(&vm, PAGE_SIZE); for_each_possible_cpu(cpu) { unsigned long unit_addr = (unsigned long)vm.addr + - (cpu * pcpu4k_unit_pages << PAGE_SHIFT); + (cpu * unit_pages << PAGE_SHIFT); - for (i = 0; i < pcpu4k_unit_pages; i++) + for (i = 0; i < unit_pages; i++) populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ - ret = __pcpu_map_pages(unit_addr, - &pcpu4k_pages[cpu * pcpu4k_unit_pages], - pcpu4k_unit_pages); + ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages], + unit_pages); if (ret < 0) panic("failed to map percpu area, err=%zd\n", ret); @@ -1340,19 +1516,18 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, /* we're ready, commit */ pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n", - pcpu4k_unit_pages, static_size); + unit_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - reserved_size, -1, - pcpu4k_unit_pages << PAGE_SHIFT, vm.addr); + ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, + unit_pages << PAGE_SHIFT, vm.addr); goto out_free_ar; enomem: while (--j >= 0) - free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE); + free_fn(page_address(pages[j]), PAGE_SIZE); ret = -ENOMEM; out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); + free_bootmem(__pa(pages), pages_size); return ret; } @@ -1370,16 +1545,6 @@ static size_t pcpul_unit_size; static struct pcpul_ent *pcpul_map; static struct vm_struct pcpul_vm; -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpul_size) - return NULL; - - return virt_to_page(pcpul_map[cpu].ptr + off); -} - /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page * @static_size: the size of static percpu area in bytes @@ -1475,9 +1640,8 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", pcpul_vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, - reserved_size, dyn_size, pcpul_unit_size, - pcpul_vm.addr); + ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, + pcpul_unit_size, pcpul_vm.addr); /* sort pcpul_map array for pcpu_lpage_remapped() */ for (i = 0; i < num_possible_cpus() - 1; i++) -- cgit v1.2.3-71-gd317 From 2f39e637ea240efb74cf807d31c93a71a0b89174 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:11:00 +0900 Subject: percpu: allow non-linear / sparse cpu -> unit mapping Currently cpu and unit are always identity mapped. To allow more efficient large page support on NUMA and lazy allocation for possible but offline cpus, cpu -> unit mapping needs to be non-linear and/or sparse. This can be easily implemented by adding a cpu -> unit mapping array and using it whenever looking up the matching unit for a cpu. The only unusal conversion is in pcpu_chunk_addr_search(). The passed in address is unit0 based and unit0 might not be in use so it needs to be converted to address of an in-use unit. This is easily done by adding the unit offset for the current processor. [ Impact: allows non-linear/sparse cpu -> unit mapping, no visible change yet ] Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: David Miller --- arch/sparc/kernel/smp_64.c | 2 +- include/linux/percpu.h | 3 +- mm/percpu.c | 129 +++++++++++++++++++++++++++++++++------------ 3 files changed, 97 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index f2f22ee97a7a..6970333b48b8 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1516,7 +1516,7 @@ void __init setup_per_cpu_areas(void) pcpu_unit_size = pcpu_setup_first_chunk(static_size, PERCPU_MODULE_RESERVE, dyn_size, - PCPU_CHUNK_SIZE, vm.addr); + PCPU_CHUNK_SIZE, vm.addr, NULL); free_bootmem(__pa(ptrs), ptrs_size); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 63c8b7a23e66..1e0e8878dc2a 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -57,6 +57,7 @@ #endif extern void *pcpu_base_addr; +extern const int *pcpu_unit_map; typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); @@ -66,7 +67,7 @@ typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size, size_t unit_size, - void *base_addr); + void *base_addr, const int *unit_map); extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, diff --git a/mm/percpu.c b/mm/percpu.c index 21756814d99f..2196fae24f00 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -8,12 +8,13 @@ * * This is percpu allocator which can handle both static and dynamic * areas. Percpu areas are allocated in chunks in vmalloc area. Each - * chunk is consisted of num_possible_cpus() units and the first chunk - * is used for static percpu variables in the kernel image (special - * boot time alloc/init handling necessary as these areas need to be - * brought up before allocation services are running). Unit grows as - * necessary and all units grow or shrink in unison. When a chunk is - * filled up, another chunk is allocated. ie. in vmalloc area + * chunk is consisted of boot-time determined number of units and the + * first chunk is used for static percpu variables in the kernel image + * (special boot time alloc/init handling necessary as these areas + * need to be brought up before allocation services are running). + * Unit grows as necessary and all units grow or shrink in unison. + * When a chunk is filled up, another chunk is allocated. ie. in + * vmalloc area * * c0 c1 c2 * ------------------- ------------------- ------------ @@ -22,11 +23,13 @@ * * Allocation is done in offset-size areas of single unit space. Ie, * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, - * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring - * percpu base registers pcpu_unit_size apart. + * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to + * cpus. On NUMA, the mapping can be non-linear and even sparse. + * Percpu access can be done by configuring percpu base registers + * according to cpu to unit mapping and pcpu_unit_size. * - * There are usually many small percpu allocations many of them as - * small as 4 bytes. The allocator organizes chunks into lists + * There are usually many small percpu allocations many of them being + * as small as 4 bytes. The allocator organizes chunks into lists * according to free size and tries to allocate from the fullest one. * Each chunk keeps the maximum contiguous area size hint which is * guaranteed to be eqaul to or larger than the maximum contiguous @@ -99,14 +102,22 @@ struct pcpu_chunk { static int pcpu_unit_pages __read_mostly; static int pcpu_unit_size __read_mostly; +static int pcpu_nr_units __read_mostly; static int pcpu_chunk_size __read_mostly; static int pcpu_nr_slots __read_mostly; static size_t pcpu_chunk_struct_size __read_mostly; +/* cpus with the lowest and highest unit numbers */ +static unsigned int pcpu_first_unit_cpu __read_mostly; +static unsigned int pcpu_last_unit_cpu __read_mostly; + /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); +/* cpu -> unit map */ +const int *pcpu_unit_map __read_mostly; + /* * The first chunk which always exists. Note that unlike other * chunks, this one can be allocated and mapped in several different @@ -177,7 +188,7 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) static int pcpu_page_idx(unsigned int cpu, int page_idx) { - return cpu * pcpu_unit_pages + page_idx; + return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, @@ -321,6 +332,14 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) return pcpu_first_chunk; } + /* + * The address is relative to unit0 which might be unused and + * thus unmapped. Offset the address to the unit space of the + * current processor before looking it up in the vmalloc + * space. Note that any possible cpu id can be used here, so + * there's no need to worry about preemption or cpu hotplug. + */ + addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size; return pcpu_get_page_chunk(vmalloc_to_page(addr)); } @@ -593,8 +612,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, { static struct page **pages; static unsigned long *bitmap; - size_t pages_size = num_possible_cpus() * pcpu_unit_pages * - sizeof(pages[0]); + size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); @@ -692,10 +710,9 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; - - flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + flush_cache_vunmap( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); } static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) @@ -756,10 +773,9 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; - - flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + flush_tlb_kernel_range( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, @@ -835,11 +851,9 @@ err: static void pcpu_post_map_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; - - /* flush at once, please read comments in pcpu_unmap() */ - flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + flush_cache_vmap( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); } /** @@ -953,8 +967,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) bitmap_copy(chunk->populated, populated, pcpu_unit_pages); clear: for_each_possible_cpu(cpu) - memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, - size); + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); return 0; err_unmap: @@ -1088,6 +1101,7 @@ area_found: mutex_unlock(&pcpu_alloc_mutex); + /* return address relative to unit0 */ return __addr_to_pcpu_ptr(chunk->vm->addr + off); fail_unlock: @@ -1222,6 +1236,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE * @base_addr: mapped address + * @unit_map: cpu -> unit map, NULL for sequential mapping * * Initialize the first percpu chunk which contains the kernel static * perpcu area. This function is to be called from arch percpu area @@ -1260,16 +1275,17 @@ EXPORT_SYMBOL_GPL(free_percpu); */ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, ssize_t dyn_size, size_t unit_size, - void *base_addr) + void *base_addr, const int *unit_map) { static struct vm_struct first_vm; static int smap[2], dmap[2]; size_t size_sum = static_size + reserved_size + (dyn_size >= 0 ? dyn_size : 0); struct pcpu_chunk *schunk, *dchunk = NULL; + unsigned int cpu, tcpu; int i; - /* santiy checks */ + /* sanity checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); @@ -1278,9 +1294,52 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, BUG_ON(unit_size & ~PAGE_MASK); BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + /* determine number of units and verify and initialize pcpu_unit_map */ + if (unit_map) { + int first_unit = INT_MAX, last_unit = INT_MIN; + + for_each_possible_cpu(cpu) { + int unit = unit_map[cpu]; + + BUG_ON(unit < 0); + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + /* the mapping should be one-to-one */ + BUG_ON(unit_map[tcpu] == unit); + } + + if (unit < first_unit) { + pcpu_first_unit_cpu = cpu; + first_unit = unit; + } + if (unit > last_unit) { + pcpu_last_unit_cpu = cpu; + last_unit = unit; + } + } + pcpu_nr_units = last_unit + 1; + pcpu_unit_map = unit_map; + } else { + int *identity_map; + + /* #units == #cpus, identity mapped */ + identity_map = alloc_bootmem(num_possible_cpus() * + sizeof(identity_map[0])); + + for_each_possible_cpu(cpu) + identity_map[cpu] = cpu; + + pcpu_first_unit_cpu = 0; + pcpu_last_unit_cpu = pcpu_nr_units - 1; + pcpu_nr_units = num_possible_cpus(); + pcpu_unit_map = identity_map; + } + + /* determine basic parameters */ pcpu_unit_pages = unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; + pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); @@ -1349,7 +1408,7 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ - pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); + pcpu_base_addr = schunk->vm->addr; return pcpu_unit_size; } @@ -1427,7 +1486,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, size_sum >> PAGE_SHIFT, base, static_size); return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, base); + unit_size, base, NULL); } /** @@ -1519,7 +1578,7 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, unit_pages, static_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, - unit_pages << PAGE_SHIFT, vm.addr); + unit_pages << PAGE_SHIFT, vm.addr, NULL); goto out_free_ar; enomem: @@ -1641,7 +1700,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, "%zu bytes\n", pcpul_vm.addr, static_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - pcpul_unit_size, pcpul_vm.addr); + pcpul_unit_size, pcpul_vm.addr, NULL); /* sort pcpul_map array for pcpu_lpage_remapped() */ for (i = 0; i < num_possible_cpus() - 1; i++) -- cgit v1.2.3-71-gd317 From a530b7958612bafe2027e21359083dba84f0b3b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:11:00 +0900 Subject: percpu: teach large page allocator about NUMA Large page first chunk allocator is primarily used for NUMA machines; however, its NUMA handling is extremely simplistic. Regardless of their proximity, each cpu is put into separate large page just to return most of the allocated space back wasting large amount of vmalloc space and increasing cache footprint. This patch teachs NUMA details to large page allocator. Given processor proximity information, pcpu_lpage_build_unit_map() will find fitting cpu -> unit mapping in which cpus in LOCAL_DISTANCE share the same large page and not too much virtual address space is wasted. This greatly reduces the unit and thus chunk size and wastes much less address space for the first chunk. For example, on 4/4 NUMA machine, the original code occupied 16MB of virtual space for the first chunk while the new code only uses 4MB - one 2MB page for each node. [ Impact: much better space efficiency on NUMA machines ] Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Jan Beulich Cc: Andi Kleen Cc: David Miller --- arch/x86/kernel/setup_percpu.c | 72 +++++++-- include/linux/percpu.h | 24 ++- mm/percpu.c | 358 ++++++++++++++++++++++++++++++++--------- 3 files changed, 359 insertions(+), 95 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 4f2e0ac9130b..7501bb14bd51 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -149,36 +149,73 @@ static void __init pcpul_map(void *ptr, size_t size, void *addr) set_pmd(pmd, pmd_v); } +static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) +{ + if (early_cpu_to_node(from) == early_cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; +} + static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; + size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; + size_t unit_map_size, unit_size; + int *unit_map; + int nr_units; + ssize_t ret; + + /* on non-NUMA, embedding is better */ + if (!chosen && !pcpu_need_numa()) + return -EINVAL; + + /* need PSE */ + if (!cpu_has_pse) { + pr_warning("PERCPU: lpage allocator requires PSE\n"); + return -EINVAL; + } + /* allocate and build unit_map */ + unit_map_size = num_possible_cpus() * sizeof(int); + unit_map = alloc_bootmem_nopanic(unit_map_size); + if (!unit_map) { + pr_warning("PERCPU: failed to allocate unit_map\n"); + return -ENOMEM; + } + + ret = pcpu_lpage_build_unit_map(static_size, + PERCPU_FIRST_CHUNK_RESERVE, + &dyn_size, &unit_size, PMD_SIZE, + unit_map, pcpu_lpage_cpu_distance); + if (ret < 0) { + pr_warning("PERCPU: failed to build unit_map\n"); + goto out_free; + } + nr_units = ret; + + /* do the parameters look okay? */ if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = num_possible_cpus() * PMD_SIZE; - - /* on non-NUMA, embedding is better */ - if (!pcpu_need_numa()) - return -EINVAL; + size_t tot_size = nr_units * unit_size; /* don't consume more than 20% of vmalloc area */ if (tot_size > vm_size / 5) { pr_info("PERCPU: too large chunk size %zuMB for " "large page remap\n", tot_size >> 20); - return -EINVAL; + ret = -EINVAL; + goto out_free; } } - /* need PSE */ - if (!cpu_has_pse) { - pr_warning("PERCPU: lpage allocator requires PSE\n"); - return -EINVAL; - } - - return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, - PMD_SIZE, - pcpu_fc_alloc, pcpu_fc_free, pcpul_map); + ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, unit_size, PMD_SIZE, + unit_map, nr_units, + pcpu_fc_alloc, pcpu_fc_free, pcpul_map); +out_free: + if (ret < 0) + free_bootmem(__pa(unit_map), unit_map_size); + return ret; } #else static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) @@ -299,7 +336,8 @@ void __init setup_per_cpu_areas(void) /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; + per_cpu_offset(cpu) = + delta + pcpu_unit_map[cpu] * pcpu_unit_size; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 1e0e8878dc2a..8ce91af4aa19 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -62,6 +62,7 @@ extern const int *pcpu_unit_map; typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); +typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk( @@ -80,18 +81,37 @@ extern ssize_t __init pcpu_4k_first_chunk( pcpu_fc_populate_pte_fn_t populate_pte_fn); #ifdef CONFIG_NEED_MULTIPLE_NODES +extern int __init pcpu_lpage_build_unit_map( + size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn); + extern ssize_t __init pcpu_lpage_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); extern void *pcpu_lpage_remapped(void *kaddr); #else +static inline int pcpu_lpage_build_unit_map( + size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + return -EINVAL; +} + static inline ssize_t __init pcpu_lpage_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) diff --git a/mm/percpu.c b/mm/percpu.c index 2196fae24f00..b3d0bcff8c7c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -1594,75 +1595,259 @@ out_free_ar: * Large page remapping first chunk setup helper */ #ifdef CONFIG_NEED_MULTIPLE_NODES + +/** + * pcpu_lpage_build_unit_map - build unit_map for large page remapping + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_sizep: in/out parameter for dynamic size, -1 for auto + * @unit_sizep: out parameter for unit size + * @unit_map: unit_map to be filled + * @cpu_distance_fn: callback to determine distance between cpus + * + * This function builds cpu -> unit map and determine other parameters + * considering needed percpu size, large page size and distances + * between CPUs in NUMA. + * + * CPUs which are of LOCAL_DISTANCE both ways are grouped together and + * may share units in the same large page. The returned configuration + * is guaranteed to have CPUs on different nodes on different large + * pages and >=75% usage of allocated virtual address space. + * + * RETURNS: + * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and + * returns the number of units to be allocated. -errno on failure. + */ +int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + int group_cnt_max = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int last_allocs; + unsigned int cpu, tcpu; + int group, unit; + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is multiple of lpage_size and is the smallest + * which can accomodate 4k aligned segments which are equal to + * or larger than min_unit_size. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep); + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + alloc_size = roundup(min_unit_size, lpage_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + upa--; + max_upa = upa; + + /* group cpus according to their proximity */ + for_each_possible_cpu(cpu) { + group = 0; + next_group: + for_each_possible_cpu(tcpu) { + if (cpu == tcpu) + break; + if (group_map[tcpu] == group && + (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || + cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { + group++; + goto next_group; + } + } + group_map[cpu] = group; + group_cnt[group]++; + group_cnt_max = max(group_cnt_max, group_cnt[group]); + } + + /* + * Expand unit size until address space usage goes over 75% + * and then as much as possible without using more address + * space. + */ + last_allocs = INT_MAX; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; + + if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + continue; + + for (group = 0; group_cnt[group]; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 25%. The + * greater-than comparison ensures upa==1 always + * passes the following check. + */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; + } + *unit_sizep = alloc_size / best_upa; + + /* assign units to cpus accordingly */ + unit = 0; + for (group = 0; group_cnt[group]; group++) { + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + unit_map[cpu] = unit++; + unit = roundup(unit, best_upa); + } + + return unit; /* unit contains aligned number of units */ +} + struct pcpul_ent { - unsigned int cpu; void *ptr; + void *map_addr; }; static size_t pcpul_size; -static size_t pcpul_unit_size; +static size_t pcpul_lpage_size; +static int pcpul_nr_lpages; static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; + +static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, + unsigned int *cpup) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + if (unit_map[cpu] == unit) { + if (cpup) + *cpup = cpu; + return true; + } + + return false; +} + +static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, + size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units) +{ + int width = 1, v = nr_units; + char empty_str[] = "--------"; + int upl, lpl; /* units per lpage, lpage per line */ + unsigned int cpu; + int lpage, unit; + + while (v /= 10) + width++; + empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0'; + + upl = max_t(int, lpage_size / unit_size, 1); + lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1)); + + printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl, + static_size, reserved_size, dyn_size, unit_size, lpage_size); + + for (lpage = 0, unit = 0; unit < nr_units; unit++) { + if (!(unit % upl)) { + if (!(lpage++ % lpl)) { + printk("\n"); + printk("%spcpu-lpage: ", lvl); + } else + printk("| "); + } + if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + printk("%0*d ", width, cpu); + else + printk("%s ", empty_str); + } + printk("\n"); +} /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @dyn_size: free size for dynamic allocation in bytes + * @unit_size: unit size in bytes * @lpage_size: the size of a large page + * @unit_map: cpu -> unit mapping + * @nr_units: the number of units * @alloc_fn: function to allocate percpu lpage, always called with lpage_size * @free_fn: function to free percpu memory, @size <= lpage_size * @map_fn: function to map percpu lpage, always called with lpage_size * - * This allocator uses large page as unit. A large page is allocated - * for each cpu and each is remapped into vmalloc area using large - * page mapping. As large page can be quite large, only part of it is - * used for the first chunk. Unused part is returned to the bootmem - * allocator. - * - * So, the large pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more large TLB entry pressure but still is - * much better than only using 4k mappings while still being NUMA - * friendly. + * This allocator uses large page to build and map the first chunk. + * Unlike other helpers, the caller should always specify @dyn_size + * and @unit_size. These parameters along with @unit_map and + * @nr_units can be determined using pcpu_lpage_build_unit_map(). + * This two stage initialization is to allow arch code to evaluate the + * parameters before committing to it. + * + * Large pages are allocated as directed by @unit_map and other + * parameters and mapped to vmalloc space. Unused holes are returned + * to the page allocator. Note that these holes end up being actively + * mapped twice - once to the physical mapping and to the vmalloc area + * for the first percpu chunk. Depending on architecture, this might + * cause problem when changing page attributes of the returned area. + * These double mapped areas can be detected using + * pcpu_lpage_remapped(). * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { - size_t size_sum; + static struct vm_struct vm; + size_t chunk_size = unit_size * nr_units; size_t map_size; unsigned int cpu; - int i, j; ssize_t ret; + int i, j, unit; - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. - */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size, + unit_size, lpage_size, unit_map, nr_units); - pcpul_unit_size = lpage_size; - pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - if (pcpul_size > pcpul_unit_size) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } + BUG_ON(chunk_size % lpage_size); + + pcpul_size = static_size + reserved_size + dyn_size; + pcpul_lpage_size = lpage_size; + pcpul_nr_lpages = chunk_size / lpage_size; /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]); pcpul_map = alloc_bootmem(map_size); - for_each_possible_cpu(cpu) { + /* allocate all pages */ + for (i = 0; i < pcpul_nr_lpages; i++) { + size_t offset = i * lpage_size; + int first_unit = offset / unit_size; + int last_unit = (offset + lpage_size - 1) / unit_size; void *ptr; + /* find out which cpu is mapped to this unit */ + for (unit = first_unit; unit <= last_unit; unit++) + if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + goto found; + continue; + found: ptr = alloc_fn(cpu, lpage_size); if (!ptr) { pr_warning("PERCPU: failed to allocate large page " @@ -1670,53 +1855,79 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, goto enomem; } - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The lpage_size up-rounding bootmem is needed - * to make sure the partial lpage is still fully RAM - - * it's not well-specified to have a incompatible area - * (unmapped RAM, device memory, etc.) in that hole. - */ - free_fn(ptr + pcpul_size, lpage_size - pcpul_size); - - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = ptr; + pcpul_map[i].ptr = ptr; + } - memcpy(ptr, __per_cpu_load, static_size); + /* return unused holes */ + for (unit = 0; unit < nr_units; unit++) { + size_t start = unit * unit_size; + size_t end = start + unit_size; + size_t off, next; + + /* don't free used part of occupied unit */ + if (pcpul_unit_to_cpu(unit, unit_map, NULL)) + start += pcpul_size; + + /* unit can span more than one page, punch the holes */ + for (off = start; off < end; off = next) { + void *ptr = pcpul_map[off / lpage_size].ptr; + next = min(roundup(off + 1, lpage_size), end); + if (ptr) + free_fn(ptr + off % lpage_size, next - off); + } } - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = num_possible_cpus() * pcpul_unit_size; - vm_area_register_early(&pcpul_vm, pcpul_unit_size); + /* allocate address, map and copy */ + vm.flags = VM_ALLOC; + vm.size = chunk_size; + vm_area_register_early(&vm, unit_size); + + for (i = 0; i < pcpul_nr_lpages; i++) { + if (!pcpul_map[i].ptr) + continue; + pcpul_map[i].map_addr = vm.addr + i * lpage_size; + map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr); + } for_each_possible_cpu(cpu) - map_fn(pcpul_map[cpu].ptr, pcpul_unit_size, - pcpul_vm.addr + cpu * pcpul_unit_size); + memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load, + static_size); /* we're ready, commit */ pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); + "%zu bytes\n", vm.addr, static_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - pcpul_unit_size, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < num_possible_cpus() - 1; i++) - for (j = i + 1; j < num_possible_cpus(); j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } + unit_size, vm.addr, unit_map); + + /* + * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped + * lpages are pushed to the end and trimmed. + */ + for (i = 0; i < pcpul_nr_lpages - 1; i++) + for (j = i + 1; j < pcpul_nr_lpages; j++) { + struct pcpul_ent tmp; + + if (!pcpul_map[j].ptr) + continue; + if (pcpul_map[i].ptr && + pcpul_map[i].ptr < pcpul_map[j].ptr) + continue; + + tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) + pcpul_nr_lpages--; return ret; enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_fn(pcpul_map[cpu].ptr, pcpul_size); + for (i = 0; i < pcpul_nr_lpages; i++) + if (pcpul_map[i].ptr) + free_fn(pcpul_map[i].ptr, lpage_size); free_bootmem(__pa(pcpul_map), map_size); return -ENOMEM; } @@ -1739,10 +1950,10 @@ enomem: */ void *pcpu_lpage_remapped(void *kaddr) { - unsigned long unit_mask = pcpul_unit_size - 1; - void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask); - unsigned long offset = (unsigned long)kaddr & unit_mask; - int left = 0, right = num_possible_cpus() - 1; + unsigned long lpage_mask = pcpul_lpage_size - 1; + void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask); + unsigned long offset = (unsigned long)kaddr & lpage_mask; + int left = 0, right = pcpul_nr_lpages - 1; int pos; /* pcpul in use at all? */ @@ -1757,13 +1968,8 @@ void *pcpu_lpage_remapped(void *kaddr) left = pos + 1; else if (pcpul_map[pos].ptr > lpage_addr) right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * pcpul_unit_size + offset; - } + else + return pcpul_map[pos].map_addr + offset; } return NULL; -- cgit v1.2.3-71-gd317 From 7a6d3c8b3049d07123628f2bf57127bba2cc878f Mon Sep 17 00:00:00 2001 From: Csaba Henk Date: Wed, 1 Jul 2009 17:28:41 -0700 Subject: fuse: make the number of max background requests and congestion threshold tunable The practical values for these limits depend on the design of the filesystem server so let userspace set them at initialization time. Signed-off-by: Csaba Henk Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 10 +++++----- fs/fuse/fuse_i.h | 12 ++++++------ fs/fuse/inode.c | 14 ++++++++++++++ include/linux/fuse.h | 9 +++++++-- 4 files changed, 32 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f58ecbc416c8..b152761c1bf6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -250,7 +250,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) static void flush_bg_queue(struct fuse_conn *fc) { - while (fc->active_background < FUSE_MAX_BACKGROUND && + while (fc->active_background < fc->max_background && !list_empty(&fc->bg_queue)) { struct fuse_req *req; @@ -280,11 +280,11 @@ __releases(&fc->lock) list_del(&req->intr_entry); req->state = FUSE_REQ_FINISHED; if (req->background) { - if (fc->num_background == FUSE_MAX_BACKGROUND) { + if (fc->num_background == fc->max_background) { fc->blocked = 0; wake_up_all(&fc->blocked_waitq); } - if (fc->num_background == FUSE_CONGESTION_THRESHOLD && + if (fc->num_background == fc->congestion_threshold && fc->connected && fc->bdi_initialized) { clear_bdi_congested(&fc->bdi, READ); clear_bdi_congested(&fc->bdi, WRITE); @@ -410,9 +410,9 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc, { req->background = 1; fc->num_background++; - if (fc->num_background == FUSE_MAX_BACKGROUND) + if (fc->num_background == fc->max_background) fc->blocked = 1; - if (fc->num_background == FUSE_CONGESTION_THRESHOLD && + if (fc->num_background == fc->congestion_threshold && fc->bdi_initialized) { set_bdi_congested(&fc->bdi, READ); set_bdi_congested(&fc->bdi, WRITE); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 52b641fc0faf..6bcfab04396f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -25,12 +25,6 @@ /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 -/** Maximum number of outstanding background requests */ -#define FUSE_MAX_BACKGROUND 12 - -/** Congestion starts at 75% of maximum */ -#define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100) - /** Bias for fi->writectr, meaning new writepages must not be sent */ #define FUSE_NOWRITE INT_MIN @@ -349,6 +343,12 @@ struct fuse_conn { /** rbtree of fuse_files waiting for poll events indexed by ph */ struct rb_root polled_files; + /** Maximum number of outstanding background requests */ + unsigned max_background; + + /** Number of background requests at which congestion starts */ + unsigned congestion_threshold; + /** Number of requests currently in the background */ unsigned num_background; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f91ccc4a189d..9aa6f46d0c32 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -32,6 +32,12 @@ DEFINE_MUTEX(fuse_mutex); #define FUSE_DEFAULT_BLKSIZE 512 +/** Maximum number of outstanding background requests */ +#define FUSE_DEFAULT_MAX_BACKGROUND 12 + +/** Congestion starts at 75% of maximum */ +#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) + struct fuse_mount_data { int fd; unsigned rootmode; @@ -517,6 +523,8 @@ void fuse_conn_init(struct fuse_conn *fc) INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); atomic_set(&fc->num_waiting, 0); + fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; + fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; fc->khctr = 0; fc->polled_files = RB_ROOT; fc->reqctr = 0; @@ -736,6 +744,12 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) else { unsigned long ra_pages; + if (arg->minor >= 13) { + if (arg->max_background) + fc->max_background = arg->max_background; + if (arg->congestion_threshold) + fc->congestion_threshold = arg->congestion_threshold; + } if (arg->minor >= 6) { ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; if (arg->flags & FUSE_ASYNC_READ) diff --git a/include/linux/fuse.h b/include/linux/fuse.h index cf593bf9fd32..b3700f0ac268 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -30,6 +30,10 @@ * - add umask flag to input argument of open, mknod and mkdir * - add notification messages for invalidation of inodes and * directory entries + * + * 7.13 + * - make max number of background requests and congestion threshold + * tunables */ #ifndef _LINUX_FUSE_H @@ -41,7 +45,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 12 +#define FUSE_KERNEL_MINOR_VERSION 13 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -427,7 +431,8 @@ struct fuse_init_out { __u32 minor; __u32 max_readahead; __u32 flags; - __u32 unused; + __u16 max_background; + __u16 congestion_threshold; __u32 max_write; }; -- cgit v1.2.3-71-gd317 From 37d217f029a56a6d385f99773fb27dfcb51f9a46 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 8 Jul 2009 18:17:58 +0200 Subject: fuse: document protocol version negotiation Clarify how the protocol version should be negotiated between kernel and userspace. Notably libfuse didn't correctly handle the case when the supported major versions didn't match. Signed-off-by: Miklos Szeredi --- include/linux/fuse.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fuse.h b/include/linux/fuse.h index b3700f0ac268..3e2925a34bf0 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -41,6 +41,26 @@ #include +/* + * Version negotiation: + * + * Both the kernel and userspace send the version they support in the + * INIT request and reply respectively. + * + * If the major versions match then both shall use the smallest + * of the two minor versions for communication. + * + * If the kernel supports a larger major version, then userspace shall + * reply with the major version it supports, ignore the rest of the + * INIT message and expect a new INIT message from the kernel with a + * matching major version. + * + * If the library supports a larger major version, then it shall fall + * back to the major protocol version sent by the kernel for + * communication and reply with that major version (and an arbitrary + * supported minor version). + */ + /** Version number of this interface */ #define FUSE_KERNEL_VERSION 7 -- cgit v1.2.3-71-gd317 From 5a421ce3c062a87db0a9e7f2a0a7ee0a5b869aab Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 10 Jul 2009 12:37:40 +0300 Subject: nfsd41: gather and report statistics also for v4.1 ops Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- include/linux/nfs4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index bd2eba530667..aff924a24abb 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -234,7 +234,7 @@ enum nfs_opnum4 { Needs to be updated if more operations are defined in future.*/ #define FIRST_NFS4_OP OP_ACCESS -#define LAST_NFS4_OP OP_RELEASE_LOCKOWNER +#define LAST_NFS4_OP OP_RECLAIM_COMPLETE enum nfsstat4 { NFS4_OK = 0, -- cgit v1.2.3-71-gd317 From 4bd9b0f4afc76cf972578c702e1bc1b6f2d10ba5 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 24 Jun 2009 15:37:45 -0400 Subject: nfsd41: use globals for DRC limits The version 4.1 DRC memory limit and tracking variables are server wide and session specific. Replace struct svc_serv fields with globals. Stop using the svc_serv sv_lock. Add a spinlock to serialize access to the DRC limit management variables which change on session creation and deletion (usage counter) or (future) administrative action to adjust the total DRC memory limit. Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy --- fs/nfsd/nfs4state.c | 10 +++++----- fs/nfsd/nfssvc.c | 19 +++++++++++++++---- include/linux/nfsd/nfsd.h | 3 +++ include/linux/sunrpc/svc.h | 2 -- 4 files changed, 23 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 980a216a48c8..2e6a44e3d2fe 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -430,11 +430,11 @@ static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan) else if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION) fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION; - spin_lock(&nfsd_serv->sv_lock); - if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages) - np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used; - nfsd_serv->sv_drc_pages_used += np; - spin_unlock(&nfsd_serv->sv_lock); + spin_lock(&nfsd_drc_lock); + if (np + nfsd_drc_pages_used > nfsd_drc_max_pages) + np = nfsd_drc_max_pages - nfsd_drc_pages_used; + nfsd_drc_pages_used += np; + spin_unlock(&nfsd_drc_lock); if (np <= 0) { status = nfserr_resource; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index d4c9884cd54b..78d8fcd883fb 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -67,6 +67,16 @@ struct timeval nfssvc_boot; DEFINE_MUTEX(nfsd_mutex); struct svc_serv *nfsd_serv; +/* + * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used. + * nfsd_drc_max_pages limits the total amount of memory available for + * version 4.1 DRC caches. + * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. + */ +spinlock_t nfsd_drc_lock; +unsigned int nfsd_drc_max_pages; +unsigned int nfsd_drc_pages_used; + #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static struct svc_stat nfsd_acl_svcstats; static struct svc_version * nfsd_acl_version[] = { @@ -238,11 +248,12 @@ static void set_max_drc(void) { /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */ #define NFSD_DRC_SIZE_SHIFT 7 - nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages() + nfsd_drc_max_pages = nr_free_buffer_pages() >> NFSD_DRC_SIZE_SHIFT; - nfsd_serv->sv_drc_pages_used = 0; - dprintk("%s svc_drc_max_pages %u\n", __func__, - nfsd_serv->sv_drc_max_pages); + nfsd_drc_pages_used = 0; + spin_lock_init(&nfsd_drc_lock); + dprintk("%s nfsd_drc_max_pages %u\n", __func__, + nfsd_drc_max_pages); } int nfsd_create_serv(void) diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 2b49d676d0c9..2571f856908f 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -56,6 +56,9 @@ extern struct svc_version nfsd_version2, nfsd_version3, extern u32 nfsd_supported_minorversion; extern struct mutex nfsd_mutex; extern struct svc_serv *nfsd_serv; +extern spinlock_t nfsd_drc_lock; +extern unsigned int nfsd_drc_max_pages; +extern unsigned int nfsd_drc_pages_used; extern struct seq_operations nfs_exports_op; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index ea8009695c69..52e8cb0a7569 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -94,8 +94,6 @@ struct svc_serv { struct module * sv_module; /* optional module to count when * adding threads */ svc_thread_fn sv_function; /* main function for threads */ - unsigned int sv_drc_max_pages; /* Total pages for DRC */ - unsigned int sv_drc_pages_used;/* DRC pages used */ #if defined(CONFIG_NFS_V4_1) struct list_head sv_cb_list; /* queue for callback requests * that arrive over the same -- cgit v1.2.3-71-gd317 From 3d39cecc4841e8d4c4abdb401d10180f5faaded0 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 8 Jul 2009 15:23:30 +0100 Subject: intel-iommu: Remove superfluous iova_alloc_lock from IOVA code We only ever obtain this lock immediately before the iova_rbtree_lock, and release it immediately after the iova_rbtree_lock. So ditch it and just use iova_rbtree_lock. [v2: Remove the lockdep bits this time too] Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 3 --- drivers/pci/iova.c | 16 ++++------------ include/linux/iova.h | 1 - 3 files changed, 4 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index c5f7c73cbb55..d6a857397ec3 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1309,7 +1309,6 @@ static void iommu_detach_domain(struct dmar_domain *domain, } static struct iova_domain reserved_iova_list; -static struct lock_class_key reserved_alloc_key; static struct lock_class_key reserved_rbtree_key; static void dmar_init_reserved_ranges(void) @@ -1320,8 +1319,6 @@ static void dmar_init_reserved_ranges(void) init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN); - lockdep_set_class(&reserved_iova_list.iova_alloc_lock, - &reserved_alloc_key); lockdep_set_class(&reserved_iova_list.iova_rbtree_lock, &reserved_rbtree_key); diff --git a/drivers/pci/iova.c b/drivers/pci/iova.c index 46dd440e2315..7914951ef29a 100644 --- a/drivers/pci/iova.c +++ b/drivers/pci/iova.c @@ -22,7 +22,6 @@ void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit) { - spin_lock_init(&iovad->iova_alloc_lock); spin_lock_init(&iovad->iova_rbtree_lock); iovad->rbroot = RB_ROOT; iovad->cached32_node = NULL; @@ -205,7 +204,6 @@ alloc_iova(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn, bool size_aligned) { - unsigned long flags; struct iova *new_iova; int ret; @@ -219,11 +217,9 @@ alloc_iova(struct iova_domain *iovad, unsigned long size, if (size_aligned) size = __roundup_pow_of_two(size); - spin_lock_irqsave(&iovad->iova_alloc_lock, flags); ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn, new_iova, size_aligned); - spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags); if (ret) { free_iova_mem(new_iova); return NULL; @@ -381,8 +377,7 @@ reserve_iova(struct iova_domain *iovad, struct iova *iova; unsigned int overlap = 0; - spin_lock_irqsave(&iovad->iova_alloc_lock, flags); - spin_lock(&iovad->iova_rbtree_lock); + spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); for (node = rb_first(&iovad->rbroot); node; node = rb_next(node)) { if (__is_range_overlap(node, pfn_lo, pfn_hi)) { iova = container_of(node, struct iova, node); @@ -402,8 +397,7 @@ reserve_iova(struct iova_domain *iovad, iova = __insert_new_range(iovad, pfn_lo, pfn_hi); finish: - spin_unlock(&iovad->iova_rbtree_lock); - spin_unlock_irqrestore(&iovad->iova_alloc_lock, flags); + spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); return iova; } @@ -420,8 +414,7 @@ copy_reserved_iova(struct iova_domain *from, struct iova_domain *to) unsigned long flags; struct rb_node *node; - spin_lock_irqsave(&from->iova_alloc_lock, flags); - spin_lock(&from->iova_rbtree_lock); + spin_lock_irqsave(&from->iova_rbtree_lock, flags); for (node = rb_first(&from->rbroot); node; node = rb_next(node)) { struct iova *iova = container_of(node, struct iova, node); struct iova *new_iova; @@ -430,6 +423,5 @@ copy_reserved_iova(struct iova_domain *from, struct iova_domain *to) printk(KERN_ERR "Reserve iova range %lx@%lx failed\n", iova->pfn_lo, iova->pfn_lo); } - spin_unlock(&from->iova_rbtree_lock); - spin_unlock_irqrestore(&from->iova_alloc_lock, flags); + spin_unlock_irqrestore(&from->iova_rbtree_lock, flags); } diff --git a/include/linux/iova.h b/include/linux/iova.h index 228f6c94b69c..76a0759e88ec 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -28,7 +28,6 @@ struct iova { /* holds all the iova translations for a domain */ struct iova_domain { - spinlock_t iova_alloc_lock;/* Lock to protect iova allocation */ spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */ struct rb_root rbroot; /* iova domain rbtree root */ struct rb_node *cached32_node; /* Save last alloced node */ -- cgit v1.2.3-71-gd317 From a76761b621bcd8336065c4fe3a74f046858bc34c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 Jul 2009 23:35:14 +0900 Subject: percpu: add dummy pcpu_lpage_remapped() for !CONFIG_SMP !CONFIG_SMP was missing pcpu_lpage_remapped() definition causing build failure. Add dummy implementation. This was discovered by linux-next testing. Signed-off-by: Tejun Heo Cc: Randy Dunlap Cc: Kamalesh Babulal Cc: Stephen Rothwell --- include/linux/percpu.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 8ce91af4aa19..e134c8229631 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -184,6 +184,11 @@ static inline void free_percpu(void *p) static inline void __init setup_per_cpu_areas(void) { } +static inline void *pcpu_lpage_remapped(void *kaddr) +{ + return NULL; +} + #endif /* CONFIG_SMP */ #define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ -- cgit v1.2.3-71-gd317 From 719a72b7c75bb239ca6184190ab994b71a31c6dc Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 17 Jul 2009 14:59:55 +0000 Subject: usb: r8a66597-hcd platform data on_chip support Convert the r8a66597-hcd driver to use the on_chip flag from platform data to enable on chip behaviour instead of relying on CONFIG_SUPERH_ON_CHIP_R8A66597 ugliness. This makes the code cleaner and also allows us to support both external and internal r8a66597 with the same kernel. It also makes the Kconfig part more future proof since we with this patch can add support for new processors with on-chip r8a66597 without modifying the Kconfig. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/boards/mach-se/7724/setup.c | 1 + arch/sh/kernel/cpu/sh4a/setup-sh7366.c | 2 +- arch/sh/kernel/cpu/sh4a/setup-sh7723.c | 2 +- drivers/usb/host/Kconfig | 7 -- drivers/usb/host/r8a66597-hcd.c | 187 +++++++++++++++++++-------------- drivers/usb/host/r8a66597.h | 76 ++++++-------- include/linux/usb/r8a66597.h | 3 + 7 files changed, 147 insertions(+), 131 deletions(-) (limited to 'include/linux') diff --git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c index 8fed45a2fb85..4fb7e48e2843 100644 --- a/arch/sh/boards/mach-se/7724/setup.c +++ b/arch/sh/boards/mach-se/7724/setup.c @@ -304,6 +304,7 @@ static struct platform_device sh_eth_device = { }; static struct r8a66597_platdata sh7724_usb0_host_data = { + .on_chip = 1, }; static struct resource sh7724_usb0_host_resources[] = { diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c index c18f7d09281b..f6d208813564 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c @@ -40,7 +40,7 @@ static struct platform_device iic_device = { }; static struct r8a66597_platdata r8a66597_data = { - /* This set zero to all members */ + .on_chip = 1, }; static struct resource usb_host_resources[] = { diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c index e1bb80b2a27b..28516499a2c4 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c @@ -398,7 +398,7 @@ static struct platform_device rtc_device = { }; static struct r8a66597_platdata r8a66597_data = { - /* This set zero to all members */ + .on_chip = 1, }; static struct resource sh7723_usb_host_resources[] = { diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig index 1a920c70b5a1..f21ca7d27a43 100644 --- a/drivers/usb/host/Kconfig +++ b/drivers/usb/host/Kconfig @@ -336,13 +336,6 @@ config USB_R8A66597_HCD To compile this driver as a module, choose M here: the module will be called r8a66597-hcd. -config SUPERH_ON_CHIP_R8A66597 - boolean "Enable SuperH on-chip R8A66597 USB" - depends on USB_R8A66597_HCD && (CPU_SUBTYPE_SH7366 || CPU_SUBTYPE_SH7723 || CPU_SUBTYPE_SH7724) - help - This driver enables support for the on-chip R8A66597 in the - SH7366, SH7723 and SH7724 processors. - config USB_WHCI_HCD tristate "Wireless USB Host Controller Interface (WHCI) driver (EXPERIMENTAL)" depends on EXPERIMENTAL diff --git a/drivers/usb/host/r8a66597-hcd.c b/drivers/usb/host/r8a66597-hcd.c index 09895a97c10b..82dce3e0d4d7 100644 --- a/drivers/usb/host/r8a66597-hcd.c +++ b/drivers/usb/host/r8a66597-hcd.c @@ -91,43 +91,43 @@ static int r8a66597_clock_enable(struct r8a66597 *r8a66597) u16 tmp; int i = 0; -#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) -#if defined(CONFIG_HAVE_CLK) - clk_enable(r8a66597->clk); + if (r8a66597->pdata->on_chip) { +#ifdef CONFIG_HAVE_CLK + clk_enable(r8a66597->clk); #endif - do { - r8a66597_write(r8a66597, SCKE, SYSCFG0); - tmp = r8a66597_read(r8a66597, SYSCFG0); - if (i++ > 1000) { - printk(KERN_ERR "r8a66597: register access fail.\n"); - return -ENXIO; - } - } while ((tmp & SCKE) != SCKE); - r8a66597_write(r8a66597, 0x04, 0x02); -#else - do { - r8a66597_write(r8a66597, USBE, SYSCFG0); - tmp = r8a66597_read(r8a66597, SYSCFG0); - if (i++ > 1000) { - printk(KERN_ERR "r8a66597: register access fail.\n"); - return -ENXIO; - } - } while ((tmp & USBE) != USBE); - r8a66597_bclr(r8a66597, USBE, SYSCFG0); - r8a66597_mdfy(r8a66597, get_xtal_from_pdata(r8a66597->pdata), XTAL, - SYSCFG0); + do { + r8a66597_write(r8a66597, SCKE, SYSCFG0); + tmp = r8a66597_read(r8a66597, SYSCFG0); + if (i++ > 1000) { + printk(KERN_ERR "r8a66597: reg access fail.\n"); + return -ENXIO; + } + } while ((tmp & SCKE) != SCKE); + r8a66597_write(r8a66597, 0x04, 0x02); + } else { + do { + r8a66597_write(r8a66597, USBE, SYSCFG0); + tmp = r8a66597_read(r8a66597, SYSCFG0); + if (i++ > 1000) { + printk(KERN_ERR "r8a66597: reg access fail.\n"); + return -ENXIO; + } + } while ((tmp & USBE) != USBE); + r8a66597_bclr(r8a66597, USBE, SYSCFG0); + r8a66597_mdfy(r8a66597, get_xtal_from_pdata(r8a66597->pdata), + XTAL, SYSCFG0); - i = 0; - r8a66597_bset(r8a66597, XCKE, SYSCFG0); - do { - msleep(1); - tmp = r8a66597_read(r8a66597, SYSCFG0); - if (i++ > 500) { - printk(KERN_ERR "r8a66597: register access fail.\n"); - return -ENXIO; - } - } while ((tmp & SCKE) != SCKE); -#endif /* #if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) */ + i = 0; + r8a66597_bset(r8a66597, XCKE, SYSCFG0); + do { + msleep(1); + tmp = r8a66597_read(r8a66597, SYSCFG0); + if (i++ > 500) { + printk(KERN_ERR "r8a66597: reg access fail.\n"); + return -ENXIO; + } + } while ((tmp & SCKE) != SCKE); + } return 0; } @@ -136,15 +136,16 @@ static void r8a66597_clock_disable(struct r8a66597 *r8a66597) { r8a66597_bclr(r8a66597, SCKE, SYSCFG0); udelay(1); -#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) -#if defined(CONFIG_HAVE_CLK) - clk_disable(r8a66597->clk); -#endif -#else - r8a66597_bclr(r8a66597, PLLC, SYSCFG0); - r8a66597_bclr(r8a66597, XCKE, SYSCFG0); - r8a66597_bclr(r8a66597, USBE, SYSCFG0); + + if (r8a66597->pdata->on_chip) { +#ifdef CONFIG_HAVE_CLK + clk_disable(r8a66597->clk); #endif + } else { + r8a66597_bclr(r8a66597, PLLC, SYSCFG0); + r8a66597_bclr(r8a66597, XCKE, SYSCFG0); + r8a66597_bclr(r8a66597, USBE, SYSCFG0); + } } static void r8a66597_enable_port(struct r8a66597 *r8a66597, int port) @@ -205,7 +206,7 @@ static int enable_controller(struct r8a66597 *r8a66597) r8a66597_bset(r8a66597, SIGNE | SACKE, INTENB1); - for (port = 0; port < R8A66597_MAX_ROOT_HUB; port++) + for (port = 0; port < r8a66597->max_root_hub; port++) r8a66597_enable_port(r8a66597, port); return 0; @@ -218,7 +219,7 @@ static void disable_controller(struct r8a66597 *r8a66597) r8a66597_write(r8a66597, 0, INTENB0); r8a66597_write(r8a66597, 0, INTSTS0); - for (port = 0; port < R8A66597_MAX_ROOT_HUB; port++) + for (port = 0; port < r8a66597->max_root_hub; port++) r8a66597_disable_port(r8a66597, port); r8a66597_clock_disable(r8a66597); @@ -249,11 +250,12 @@ static int is_hub_limit(char *devpath) return ((strlen(devpath) >= 4) ? 1 : 0); } -static void get_port_number(char *devpath, u16 *root_port, u16 *hub_port) +static void get_port_number(struct r8a66597 *r8a66597, + char *devpath, u16 *root_port, u16 *hub_port) { if (root_port) { *root_port = (devpath[0] & 0x0F) - 1; - if (*root_port >= R8A66597_MAX_ROOT_HUB) + if (*root_port >= r8a66597->max_root_hub) printk(KERN_ERR "r8a66597: Illegal root port number.\n"); } if (hub_port) @@ -355,7 +357,8 @@ static int make_r8a66597_device(struct r8a66597 *r8a66597, INIT_LIST_HEAD(&dev->device_list); list_add_tail(&dev->device_list, &r8a66597->child_device); - get_port_number(urb->dev->devpath, &dev->root_port, &dev->hub_port); + get_port_number(r8a66597, urb->dev->devpath, + &dev->root_port, &dev->hub_port); if (!is_child_device(urb->dev->devpath)) r8a66597->root_hub[dev->root_port].dev = dev; @@ -420,7 +423,7 @@ static void free_usb_address(struct r8a66597 *r8a66597, list_del(&dev->device_list); kfree(dev); - for (port = 0; port < R8A66597_MAX_ROOT_HUB; port++) { + for (port = 0; port < r8a66597->max_root_hub; port++) { if (r8a66597->root_hub[port].dev == dev) { r8a66597->root_hub[port].dev = NULL; break; @@ -495,10 +498,20 @@ static void r8a66597_pipe_toggle(struct r8a66597 *r8a66597, r8a66597_bset(r8a66597, SQCLR, pipe->pipectr); } +static inline unsigned short mbw_value(struct r8a66597 *r8a66597) +{ + if (r8a66597->pdata->on_chip) + return MBW_32; + else + return MBW_16; +} + /* this function must be called with interrupt disabled */ static inline void cfifo_change(struct r8a66597 *r8a66597, u16 pipenum) { - r8a66597_mdfy(r8a66597, MBW | pipenum, MBW | CURPIPE, CFIFOSEL); + unsigned short mbw = mbw_value(r8a66597); + + r8a66597_mdfy(r8a66597, mbw | pipenum, mbw | CURPIPE, CFIFOSEL); r8a66597_reg_wait(r8a66597, CFIFOSEL, CURPIPE, pipenum); } @@ -506,11 +519,13 @@ static inline void cfifo_change(struct r8a66597 *r8a66597, u16 pipenum) static inline void fifo_change_from_pipe(struct r8a66597 *r8a66597, struct r8a66597_pipe *pipe) { + unsigned short mbw = mbw_value(r8a66597); + cfifo_change(r8a66597, 0); - r8a66597_mdfy(r8a66597, MBW | 0, MBW | CURPIPE, D0FIFOSEL); - r8a66597_mdfy(r8a66597, MBW | 0, MBW | CURPIPE, D1FIFOSEL); + r8a66597_mdfy(r8a66597, mbw | 0, mbw | CURPIPE, D0FIFOSEL); + r8a66597_mdfy(r8a66597, mbw | 0, mbw | CURPIPE, D1FIFOSEL); - r8a66597_mdfy(r8a66597, MBW | pipe->info.pipenum, MBW | CURPIPE, + r8a66597_mdfy(r8a66597, mbw | pipe->info.pipenum, mbw | CURPIPE, pipe->fifosel); r8a66597_reg_wait(r8a66597, pipe->fifosel, CURPIPE, pipe->info.pipenum); } @@ -742,9 +757,13 @@ static void enable_r8a66597_pipe_dma(struct r8a66597 *r8a66597, struct r8a66597_pipe *pipe, struct urb *urb) { -#if !defined(CONFIG_SUPERH_ON_CHIP_R8A66597) int i; struct r8a66597_pipe_info *info = &pipe->info; + unsigned short mbw = mbw_value(r8a66597); + + /* pipe dma is only for external controlles */ + if (r8a66597->pdata->on_chip) + return; if ((pipe->info.pipenum != 0) && (info->type != R8A66597_INT)) { for (i = 0; i < R8A66597_MAX_DMA_CHANNEL; i++) { @@ -763,8 +782,8 @@ static void enable_r8a66597_pipe_dma(struct r8a66597 *r8a66597, set_pipe_reg_addr(pipe, i); cfifo_change(r8a66597, 0); - r8a66597_mdfy(r8a66597, MBW | pipe->info.pipenum, - MBW | CURPIPE, pipe->fifosel); + r8a66597_mdfy(r8a66597, mbw | pipe->info.pipenum, + mbw | CURPIPE, pipe->fifosel); r8a66597_reg_wait(r8a66597, pipe->fifosel, CURPIPE, pipe->info.pipenum); @@ -772,7 +791,6 @@ static void enable_r8a66597_pipe_dma(struct r8a66597 *r8a66597, break; } } -#endif /* #if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) */ } /* this function must be called with interrupt disabled */ @@ -1769,7 +1787,7 @@ static void r8a66597_timer(unsigned long _r8a66597) spin_lock_irqsave(&r8a66597->lock, flags); - for (port = 0; port < R8A66597_MAX_ROOT_HUB; port++) + for (port = 0; port < r8a66597->max_root_hub; port++) r8a66597_root_hub_control(r8a66597, port); spin_unlock_irqrestore(&r8a66597->lock, flags); @@ -1807,7 +1825,7 @@ static void set_address_zero(struct r8a66597 *r8a66597, struct urb *urb) u16 root_port, hub_port; if (usb_address == 0) { - get_port_number(urb->dev->devpath, + get_port_number(r8a66597, urb->dev->devpath, &root_port, &hub_port); set_devadd_reg(r8a66597, 0, get_r8a66597_usb_speed(urb->dev->speed), @@ -2082,7 +2100,7 @@ static int r8a66597_hub_status_data(struct usb_hcd *hcd, char *buf) *buf = 0; /* initialize (no change) */ - for (i = 0; i < R8A66597_MAX_ROOT_HUB; i++) { + for (i = 0; i < r8a66597->max_root_hub; i++) { if (r8a66597->root_hub[i].port & 0xffff0000) *buf |= 1 << (i + 1); } @@ -2097,11 +2115,11 @@ static void r8a66597_hub_descriptor(struct r8a66597 *r8a66597, { desc->bDescriptorType = 0x29; desc->bHubContrCurrent = 0; - desc->bNbrPorts = R8A66597_MAX_ROOT_HUB; + desc->bNbrPorts = r8a66597->max_root_hub; desc->bDescLength = 9; desc->bPwrOn2PwrGood = 0; desc->wHubCharacteristics = cpu_to_le16(0x0011); - desc->bitmap[0] = ((1 << R8A66597_MAX_ROOT_HUB) - 1) << 1; + desc->bitmap[0] = ((1 << r8a66597->max_root_hub) - 1) << 1; desc->bitmap[1] = ~0; } @@ -2129,7 +2147,7 @@ static int r8a66597_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, } break; case ClearPortFeature: - if (wIndex > R8A66597_MAX_ROOT_HUB) + if (wIndex > r8a66597->max_root_hub) goto error; if (wLength != 0) goto error; @@ -2162,12 +2180,12 @@ static int r8a66597_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, *buf = 0x00; break; case GetPortStatus: - if (wIndex > R8A66597_MAX_ROOT_HUB) + if (wIndex > r8a66597->max_root_hub) goto error; *(__le32 *)buf = cpu_to_le32(rh->port); break; case SetPortFeature: - if (wIndex > R8A66597_MAX_ROOT_HUB) + if (wIndex > r8a66597->max_root_hub) goto error; if (wLength != 0) goto error; @@ -2216,7 +2234,7 @@ static int r8a66597_bus_suspend(struct usb_hcd *hcd) dbg("%s", __func__); - for (port = 0; port < R8A66597_MAX_ROOT_HUB; port++) { + for (port = 0; port < r8a66597->max_root_hub; port++) { struct r8a66597_root_hub *rh = &r8a66597->root_hub[port]; unsigned long dvstctr_reg = get_dvstctr_reg(port); @@ -2247,7 +2265,7 @@ static int r8a66597_bus_resume(struct usb_hcd *hcd) dbg("%s", __func__); - for (port = 0; port < R8A66597_MAX_ROOT_HUB; port++) { + for (port = 0; port < r8a66597->max_root_hub; port++) { struct r8a66597_root_hub *rh = &r8a66597->root_hub[port]; unsigned long dvstctr_reg = get_dvstctr_reg(port); @@ -2314,7 +2332,7 @@ static int r8a66597_suspend(struct device *dev) disable_controller(r8a66597); - for (port = 0; port < R8A66597_MAX_ROOT_HUB; port++) { + for (port = 0; port < r8a66597->max_root_hub; port++) { struct r8a66597_root_hub *rh = &r8a66597->root_hub[port]; rh->port = 0x00000000; @@ -2354,8 +2372,9 @@ static int __init_or_module r8a66597_remove(struct platform_device *pdev) del_timer_sync(&r8a66597->rh_timer); usb_remove_hcd(hcd); iounmap((void *)r8a66597->reg); -#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) && defined(CONFIG_HAVE_CLK) - clk_put(r8a66597->clk); +#ifdef CONFIG_HAVE_CLK + if (r8a66597->pdata->on_chip) + clk_put(r8a66597->clk); #endif usb_put_hcd(hcd); return 0; @@ -2363,7 +2382,7 @@ static int __init_or_module r8a66597_remove(struct platform_device *pdev) static int __devinit r8a66597_probe(struct platform_device *pdev) { -#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) && defined(CONFIG_HAVE_CLK) +#ifdef CONFIG_HAVE_CLK char clk_name[8]; #endif struct resource *res = NULL, *ires; @@ -2425,15 +2444,20 @@ static int __devinit r8a66597_probe(struct platform_device *pdev) r8a66597->pdata = pdev->dev.platform_data; r8a66597->irq_sense_low = irq_trigger == IRQF_TRIGGER_LOW; -#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) && defined(CONFIG_HAVE_CLK) - snprintf(clk_name, sizeof(clk_name), "usb%d", pdev->id); - r8a66597->clk = clk_get(&pdev->dev, clk_name); - if (IS_ERR(r8a66597->clk)) { - dev_err(&pdev->dev, "cannot get clock \"%s\"\n", clk_name); - ret = PTR_ERR(r8a66597->clk); - goto clean_up2; - } + if (r8a66597->pdata->on_chip) { +#ifdef CONFIG_HAVE_CLK + snprintf(clk_name, sizeof(clk_name), "usb%d", pdev->id); + r8a66597->clk = clk_get(&pdev->dev, clk_name); + if (IS_ERR(r8a66597->clk)) { + dev_err(&pdev->dev, "cannot get clock \"%s\"\n", + clk_name); + ret = PTR_ERR(r8a66597->clk); + goto clean_up2; + } #endif + r8a66597->max_root_hub = 1; + } else + r8a66597->max_root_hub = 2; spin_lock_init(&r8a66597->lock); init_timer(&r8a66597->rh_timer); @@ -2463,8 +2487,9 @@ static int __devinit r8a66597_probe(struct platform_device *pdev) return 0; clean_up3: -#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) && defined(CONFIG_HAVE_CLK) - clk_put(r8a66597->clk); +#ifdef CONFIG_HAVE_CLK + if (r8a66597->pdata->on_chip) + clk_put(r8a66597->clk); clean_up2: #endif usb_put_hcd(hcd); diff --git a/drivers/usb/host/r8a66597.h b/drivers/usb/host/r8a66597.h index d72680b433f9..eecbd917bc81 100644 --- a/drivers/usb/host/r8a66597.h +++ b/drivers/usb/host/r8a66597.h @@ -26,7 +26,7 @@ #ifndef __R8A66597_H__ #define __R8A66597_H__ -#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) && defined(CONFIG_HAVE_CLK) +#ifdef CONFIG_HAVE_CLK #include #endif @@ -193,13 +193,9 @@ #define REW 0x4000 /* b14: Buffer rewind */ #define DCLRM 0x2000 /* b13: DMA buffer clear mode */ #define DREQE 0x1000 /* b12: DREQ output enable */ -#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) -#define MBW 0x0800 -#else -#define MBW 0x0400 /* b10: Maximum bit width for FIFO access */ -#endif #define MBW_8 0x0000 /* 8bit */ #define MBW_16 0x0400 /* 16bit */ +#define MBW_32 0x0800 /* 32bit */ #define BIGEND 0x0100 /* b8: Big endian mode */ #define BYTE_LITTLE 0x0000 /* little dendian */ #define BYTE_BIG 0x0100 /* big endifan */ @@ -405,11 +401,7 @@ #define R8A66597_MAX_NUM_PIPE 10 #define R8A66597_BUF_BSIZE 8 #define R8A66597_MAX_DEVICE 10 -#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) -#define R8A66597_MAX_ROOT_HUB 1 -#else #define R8A66597_MAX_ROOT_HUB 2 -#endif #define R8A66597_MAX_SAMPLING 5 #define R8A66597_RH_POLL_TIME 10 #define R8A66597_MAX_DMA_CHANNEL 2 @@ -487,7 +479,7 @@ struct r8a66597_root_hub { struct r8a66597 { spinlock_t lock; unsigned long reg; -#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) && defined(CONFIG_HAVE_CLK) +#ifdef CONFIG_HAVE_CLK struct clk *clk; #endif struct r8a66597_platdata *pdata; @@ -504,6 +496,7 @@ struct r8a66597 { unsigned short interval_map; unsigned char pipe_cnt[R8A66597_MAX_NUM_PIPE]; unsigned char dma_map; + unsigned int max_root_hub; struct list_head child_device; unsigned long child_connect_map[4]; @@ -550,21 +543,22 @@ static inline void r8a66597_read_fifo(struct r8a66597 *r8a66597, unsigned long offset, u16 *buf, int len) { -#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) unsigned long fifoaddr = r8a66597->reg + offset; unsigned long count; - count = len / 4; - insl(fifoaddr, buf, count); + if (r8a66597->pdata->on_chip) { + count = len / 4; + insl(fifoaddr, buf, count); - if (len & 0x00000003) { - unsigned long tmp = inl(fifoaddr); - memcpy((unsigned char *)buf + count * 4, &tmp, len & 0x03); + if (len & 0x00000003) { + unsigned long tmp = inl(fifoaddr); + memcpy((unsigned char *)buf + count * 4, &tmp, + len & 0x03); + } + } else { + len = (len + 1) / 2; + insw(fifoaddr, buf, len); } -#else - len = (len + 1) / 2; - insw(r8a66597->reg + offset, buf, len); -#endif } static inline void r8a66597_write(struct r8a66597 *r8a66597, u16 val, @@ -578,33 +572,33 @@ static inline void r8a66597_write_fifo(struct r8a66597 *r8a66597, int len) { unsigned long fifoaddr = r8a66597->reg + offset; -#if defined(CONFIG_SUPERH_ON_CHIP_R8A66597) unsigned long count; unsigned char *pb; int i; - count = len / 4; - outsl(fifoaddr, buf, count); + if (r8a66597->pdata->on_chip) { + count = len / 4; + outsl(fifoaddr, buf, count); + + if (len & 0x00000003) { + pb = (unsigned char *)buf + count * 4; + for (i = 0; i < (len & 0x00000003); i++) { + if (r8a66597_read(r8a66597, CFIFOSEL) & BIGEND) + outb(pb[i], fifoaddr + i); + else + outb(pb[i], fifoaddr + 3 - i); + } + } + } else { + int odd = len & 0x0001; - if (len & 0x00000003) { - pb = (unsigned char *)buf + count * 4; - for (i = 0; i < (len & 0x00000003); i++) { - if (r8a66597_read(r8a66597, CFIFOSEL) & BIGEND) - outb(pb[i], fifoaddr + i); - else - outb(pb[i], fifoaddr + 3 - i); + len = len / 2; + outsw(fifoaddr, buf, len); + if (unlikely(odd)) { + buf = &buf[len]; + outb((unsigned char)*buf, fifoaddr); } } -#else - int odd = len & 0x0001; - - len = len / 2; - outsw(fifoaddr, buf, len); - if (unlikely(odd)) { - buf = &buf[len]; - outb((unsigned char)*buf, fifoaddr); - } -#endif } static inline void r8a66597_mdfy(struct r8a66597 *r8a66597, diff --git a/include/linux/usb/r8a66597.h b/include/linux/usb/r8a66597.h index e9f0384fa20c..460ee3f6a2c6 100644 --- a/include/linux/usb/r8a66597.h +++ b/include/linux/usb/r8a66597.h @@ -31,6 +31,9 @@ struct r8a66597_platdata { /* This ops can controll port power instead of DVSTCTR register. */ void (*port_power)(int port, int power); + /* set one = on chip controller, set zero = external controller */ + unsigned on_chip:1; + /* (external controller only) set R8A66597_PLATDATA_XTAL_nnMHZ */ unsigned xtal:2; -- cgit v1.2.3-71-gd317 From 99fde513f57db2c8e1b202ade4be7d47033ff09b Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Mon, 20 Jul 2009 22:28:50 -0700 Subject: Input: wm97xx - add possibility to control the GPIO_STATUS shift This patch allows tweaking the behaviour of GPIO_STATUS register shift quirk that's in wm97xx-core. The problem with GPIO_STATUS register being shifted by one doesn't appear on all hardware it seems and causes problems with accelerated touchscreen drivers on Palm hardware. Therefore an accelerated touchscreen driver can select if the shift is/isn't happening on the hardware. Signed-off-by: Marek Vasut Acked-by: Mark Brown Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/mainstone-wm97xx.c | 3 +++ drivers/input/touchscreen/wm97xx-core.c | 6 ++++-- include/linux/wm97xx.h | 7 +++++++ 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/touchscreen/mainstone-wm97xx.c b/drivers/input/touchscreen/mainstone-wm97xx.c index c797bc04ee83..8fc3b08deb3b 100644 --- a/drivers/input/touchscreen/mainstone-wm97xx.c +++ b/drivers/input/touchscreen/mainstone-wm97xx.c @@ -198,6 +198,9 @@ static int wm97xx_acc_startup(struct wm97xx *wm) if (machine_is_palmt5() || machine_is_palmtx() || machine_is_palmld()) { pen_int = 1; irq = 27; + /* There is some obscure mutant of WM9712 interbred with WM9713 + * used on Palm HW */ + wm->variant = WM97xx_WM1613; } else if (machine_is_mainstone() && pen_int) irq = 4; diff --git a/drivers/input/touchscreen/wm97xx-core.c b/drivers/input/touchscreen/wm97xx-core.c index 2957d48e0045..252eb11fe9db 100644 --- a/drivers/input/touchscreen/wm97xx-core.c +++ b/drivers/input/touchscreen/wm97xx-core.c @@ -204,7 +204,7 @@ void wm97xx_set_gpio(struct wm97xx *wm, u32 gpio, else reg &= ~gpio; - if (wm->id == WM9712_ID2) + if (wm->id == WM9712_ID2 && wm->variant != WM97xx_WM1613) wm97xx_reg_write(wm, AC97_GPIO_STATUS, reg << 1); else wm97xx_reg_write(wm, AC97_GPIO_STATUS, reg); @@ -307,7 +307,7 @@ static void wm97xx_pen_irq_worker(struct work_struct *work) WM97XX_GPIO_13); } - if (wm->id == WM9712_ID2) + if (wm->id == WM9712_ID2 && wm->variant != WM97xx_WM1613) wm97xx_reg_write(wm, AC97_GPIO_STATUS, (status & ~WM97XX_GPIO_13) << 1); else @@ -582,6 +582,8 @@ static int wm97xx_probe(struct device *dev) wm->id = wm97xx_reg_read(wm, AC97_VENDOR_ID2); + wm->variant = WM97xx_GENERIC; + dev_info(wm->dev, "detected a wm97%02x codec\n", wm->id & 0xff); switch (wm->id & 0xff) { diff --git a/include/linux/wm97xx.h b/include/linux/wm97xx.h index 6f69968eab24..0c9878123d5f 100644 --- a/include/linux/wm97xx.h +++ b/include/linux/wm97xx.h @@ -15,6 +15,12 @@ #include /* Input device layer */ #include +/* + * WM97xx variants + */ +#define WM97xx_GENERIC 0x0000 +#define WM97xx_WM1613 0x1613 + /* * WM97xx AC97 Touchscreen registers */ @@ -283,6 +289,7 @@ struct wm97xx { unsigned pen_is_down:1; /* Pen is down */ unsigned aux_waiting:1; /* aux measurement waiting */ unsigned pen_probably_down:1; /* used in polling mode */ + u16 variant; /* WM97xx chip variant */ u16 suspend_mode; /* PRP in suspend mode */ }; -- cgit v1.2.3-71-gd317 From d7aacaddcac3971e33cf52d7e610c06696cb347f Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 8 Jul 2009 13:21:31 +0200 Subject: Driver Core: Add platform device arch data V3 Allow architecture specific data in struct platform_device V3. With this patch struct pdev_archdata is added to struct platform_device, similar to struct dev_archdata in found in struct device. Useful for architecture code that needs to keep extra data associated with each platform device. Struct pdev_archdata is different from dev.platform_data, the convention is that dev.platform_data points to driver-specific data. It may or may not be required by the driver. The format of this depends on driver but is the same across architectures. The structure pdev_archdata is a place for architecture specific data. This data is handled by architecture specific code (for example runtime PM), and since it is architecture specific it should _never_ be touched by device driver code. Exactly like struct dev_archdata but for platform devices. [rjw: This change is for power management mostly and that's why it goes through the suspend tree.] Signed-off-by: Magnus Damm Acked-by: Kevin Hilman Acked-by: Greg Kroah-Hartman Signed-off-by: Rafael J. Wysocki --- arch/arm/include/asm/device.h | 3 +++ arch/ia64/include/asm/device.h | 3 +++ arch/microblaze/include/asm/device.h | 3 +++ arch/powerpc/include/asm/device.h | 3 +++ arch/sparc/include/asm/device.h | 3 +++ arch/x86/include/asm/device.h | 3 +++ include/asm-generic/device.h | 3 +++ include/linux/platform_device.h | 3 +++ 8 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/arch/arm/include/asm/device.h b/arch/arm/include/asm/device.h index c61642b40603..9f390ce335cb 100644 --- a/arch/arm/include/asm/device.h +++ b/arch/arm/include/asm/device.h @@ -12,4 +12,7 @@ struct dev_archdata { #endif }; +struct pdev_archdata { +}; + #endif diff --git a/arch/ia64/include/asm/device.h b/arch/ia64/include/asm/device.h index 41ab85d66f33..d66d446b127c 100644 --- a/arch/ia64/include/asm/device.h +++ b/arch/ia64/include/asm/device.h @@ -15,4 +15,7 @@ struct dev_archdata { #endif }; +struct pdev_archdata { +}; + #endif /* _ASM_IA64_DEVICE_H */ diff --git a/arch/microblaze/include/asm/device.h b/arch/microblaze/include/asm/device.h index c042830793ed..30286db27c1c 100644 --- a/arch/microblaze/include/asm/device.h +++ b/arch/microblaze/include/asm/device.h @@ -16,6 +16,9 @@ struct dev_archdata { struct device_node *of_node; }; +struct pdev_archdata { +}; + #endif /* _ASM_MICROBLAZE_DEVICE_H */ diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 7d2277cef09a..e3e06e0f7fc0 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -30,4 +30,7 @@ dev_archdata_get_node(const struct dev_archdata *ad) return ad->of_node; } +struct pdev_archdata { +}; + #endif /* _ASM_POWERPC_DEVICE_H */ diff --git a/arch/sparc/include/asm/device.h b/arch/sparc/include/asm/device.h index 3702e087df2c..f3b85b6b0b76 100644 --- a/arch/sparc/include/asm/device.h +++ b/arch/sparc/include/asm/device.h @@ -32,4 +32,7 @@ dev_archdata_get_node(const struct dev_archdata *ad) return ad->prom_node; } +struct pdev_archdata { +}; + #endif /* _ASM_SPARC_DEVICE_H */ diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 4994a20acbcb..cee34e9ca45b 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -13,4 +13,7 @@ struct dma_map_ops *dma_ops; #endif }; +struct pdev_archdata { +}; + #endif /* _ASM_X86_DEVICE_H */ diff --git a/include/asm-generic/device.h b/include/asm-generic/device.h index c17c9600f220..d7c76bba640d 100644 --- a/include/asm-generic/device.h +++ b/include/asm-generic/device.h @@ -9,4 +9,7 @@ struct dev_archdata { }; +struct pdev_archdata { +}; + #endif /* _ASM_GENERIC_DEVICE_H */ diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 8dc5123b6305..672a69849735 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -22,6 +22,9 @@ struct platform_device { struct resource * resource; struct platform_device_id *id_entry; + + /* arch specific additions */ + struct pdev_archdata archdata; }; #define platform_get_device_id(pdev) ((pdev)->id_entry) -- cgit v1.2.3-71-gd317 From 511647ff58fd0f1c1f415d2c757d841650edac91 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 8 Jul 2009 13:23:07 +0200 Subject: PM: Remove platform device suspend_late()/resume_early() V2 This is V2 of the platform driver power management late/early callback removal patch. The callbacks ->suspend_late() and ->resume_early() are removed since all in-tree users now have been migrated to dev_pm_ops. Signed-off-by: Magnus Damm Acked-by: Greg Kroah-Hartman Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- drivers/base/platform.c | 36 ------------------------------------ include/linux/platform_device.h | 2 -- 2 files changed, 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 81cb01bfc356..455e55971d0e 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -628,30 +628,6 @@ static int platform_legacy_suspend(struct device *dev, pm_message_t mesg) return ret; } -static int platform_legacy_suspend_late(struct device *dev, pm_message_t mesg) -{ - struct platform_driver *pdrv = to_platform_driver(dev->driver); - struct platform_device *pdev = to_platform_device(dev); - int ret = 0; - - if (dev->driver && pdrv->suspend_late) - ret = pdrv->suspend_late(pdev, mesg); - - return ret; -} - -static int platform_legacy_resume_early(struct device *dev) -{ - struct platform_driver *pdrv = to_platform_driver(dev->driver); - struct platform_device *pdev = to_platform_device(dev); - int ret = 0; - - if (dev->driver && pdrv->resume_early) - ret = pdrv->resume_early(pdev); - - return ret; -} - static int platform_legacy_resume(struct device *dev) { struct platform_driver *pdrv = to_platform_driver(dev->driver); @@ -714,8 +690,6 @@ static int platform_pm_suspend_noirq(struct device *dev) if (drv->pm) { if (drv->pm->suspend_noirq) ret = drv->pm->suspend_noirq(dev); - } else { - ret = platform_legacy_suspend_late(dev, PMSG_SUSPEND); } return ret; @@ -750,8 +724,6 @@ static int platform_pm_resume_noirq(struct device *dev) if (drv->pm) { if (drv->pm->resume_noirq) ret = drv->pm->resume_noirq(dev); - } else { - ret = platform_legacy_resume_early(dev); } return ret; @@ -797,8 +769,6 @@ static int platform_pm_freeze_noirq(struct device *dev) if (drv->pm) { if (drv->pm->freeze_noirq) ret = drv->pm->freeze_noirq(dev); - } else { - ret = platform_legacy_suspend_late(dev, PMSG_FREEZE); } return ret; @@ -833,8 +803,6 @@ static int platform_pm_thaw_noirq(struct device *dev) if (drv->pm) { if (drv->pm->thaw_noirq) ret = drv->pm->thaw_noirq(dev); - } else { - ret = platform_legacy_resume_early(dev); } return ret; @@ -869,8 +837,6 @@ static int platform_pm_poweroff_noirq(struct device *dev) if (drv->pm) { if (drv->pm->poweroff_noirq) ret = drv->pm->poweroff_noirq(dev); - } else { - ret = platform_legacy_suspend_late(dev, PMSG_HIBERNATE); } return ret; @@ -905,8 +871,6 @@ static int platform_pm_restore_noirq(struct device *dev) if (drv->pm) { if (drv->pm->restore_noirq) ret = drv->pm->restore_noirq(dev); - } else { - ret = platform_legacy_resume_early(dev); } return ret; diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 672a69849735..3c6675c2444b 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -60,8 +60,6 @@ struct platform_driver { int (*remove)(struct platform_device *); void (*shutdown)(struct platform_device *); int (*suspend)(struct platform_device *, pm_message_t state); - int (*suspend_late)(struct platform_device *, pm_message_t state); - int (*resume_early)(struct platform_device *); int (*resume)(struct platform_device *); struct device_driver driver; struct platform_device_id *id_table; -- cgit v1.2.3-71-gd317 From fbd90375d7531927d312766b548376d909811b4d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 13:40:14 +0200 Subject: hrtimer: Remove cb_entry from struct hrtimer It's unused, remove it. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner LKML-Reference: --- include/linux/hrtimer.h | 2 -- kernel/hrtimer.c | 1 - 2 files changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 54648e625efd..40e7d54fc424 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -91,7 +91,6 @@ enum hrtimer_restart { * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) - * @cb_entry: list head to enqueue an expired timer into the callback list * @start_site: timer statistics field to store the site where the timer * was started * @start_comm: timer statistics field to store the name of the process which @@ -108,7 +107,6 @@ struct hrtimer { enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; unsigned long state; - struct list_head cb_entry; #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 43d151f185b6..052a0f53e4eb 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1092,7 +1092,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, clock_id = CLOCK_MONOTONIC; timer->base = &cpu_base->clock_base[clock_id]; - INIT_LIST_HEAD(&timer->cb_entry); hrtimer_init_timer_hres(timer); #ifdef CONFIG_TIMER_STATS -- cgit v1.2.3-71-gd317 From cf4f1e76c49dacfde0680b170b9a9b6a42f296bb Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 22 Jul 2009 14:32:03 +0000 Subject: usb: move r8a66597 register defines Move r8a66597 hardware register definitions from the host controller header file to the platform data header file. With this change in place we can easily share register definitions between the host controller driver and a future gadget driver. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/usb/host/r8a66597.h | 366 ------------------------------------------ include/linux/usb/r8a66597.h | 372 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 370 insertions(+), 368 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/host/r8a66597.h b/drivers/usb/host/r8a66597.h index eecbd917bc81..228e3fb23854 100644 --- a/drivers/usb/host/r8a66597.h +++ b/drivers/usb/host/r8a66597.h @@ -32,372 +32,6 @@ #include -#define SYSCFG0 0x00 -#define SYSCFG1 0x02 -#define SYSSTS0 0x04 -#define SYSSTS1 0x06 -#define DVSTCTR0 0x08 -#define DVSTCTR1 0x0A -#define TESTMODE 0x0C -#define PINCFG 0x0E -#define DMA0CFG 0x10 -#define DMA1CFG 0x12 -#define CFIFO 0x14 -#define D0FIFO 0x18 -#define D1FIFO 0x1C -#define CFIFOSEL 0x20 -#define CFIFOCTR 0x22 -#define CFIFOSIE 0x24 -#define D0FIFOSEL 0x28 -#define D0FIFOCTR 0x2A -#define D1FIFOSEL 0x2C -#define D1FIFOCTR 0x2E -#define INTENB0 0x30 -#define INTENB1 0x32 -#define INTENB2 0x34 -#define BRDYENB 0x36 -#define NRDYENB 0x38 -#define BEMPENB 0x3A -#define SOFCFG 0x3C -#define INTSTS0 0x40 -#define INTSTS1 0x42 -#define INTSTS2 0x44 -#define BRDYSTS 0x46 -#define NRDYSTS 0x48 -#define BEMPSTS 0x4A -#define FRMNUM 0x4C -#define UFRMNUM 0x4E -#define USBADDR 0x50 -#define USBREQ 0x54 -#define USBVAL 0x56 -#define USBINDX 0x58 -#define USBLENG 0x5A -#define DCPCFG 0x5C -#define DCPMAXP 0x5E -#define DCPCTR 0x60 -#define PIPESEL 0x64 -#define PIPECFG 0x68 -#define PIPEBUF 0x6A -#define PIPEMAXP 0x6C -#define PIPEPERI 0x6E -#define PIPE1CTR 0x70 -#define PIPE2CTR 0x72 -#define PIPE3CTR 0x74 -#define PIPE4CTR 0x76 -#define PIPE5CTR 0x78 -#define PIPE6CTR 0x7A -#define PIPE7CTR 0x7C -#define PIPE8CTR 0x7E -#define PIPE9CTR 0x80 -#define PIPE1TRE 0x90 -#define PIPE1TRN 0x92 -#define PIPE2TRE 0x94 -#define PIPE2TRN 0x96 -#define PIPE3TRE 0x98 -#define PIPE3TRN 0x9A -#define PIPE4TRE 0x9C -#define PIPE4TRN 0x9E -#define PIPE5TRE 0xA0 -#define PIPE5TRN 0xA2 -#define DEVADD0 0xD0 -#define DEVADD1 0xD2 -#define DEVADD2 0xD4 -#define DEVADD3 0xD6 -#define DEVADD4 0xD8 -#define DEVADD5 0xDA -#define DEVADD6 0xDC -#define DEVADD7 0xDE -#define DEVADD8 0xE0 -#define DEVADD9 0xE2 -#define DEVADDA 0xE4 - -/* System Configuration Control Register */ -#define XTAL 0xC000 /* b15-14: Crystal selection */ -#define XTAL48 0x8000 /* 48MHz */ -#define XTAL24 0x4000 /* 24MHz */ -#define XTAL12 0x0000 /* 12MHz */ -#define XCKE 0x2000 /* b13: External clock enable */ -#define PLLC 0x0800 /* b11: PLL control */ -#define SCKE 0x0400 /* b10: USB clock enable */ -#define PCSDIS 0x0200 /* b9: not CS wakeup */ -#define LPSME 0x0100 /* b8: Low power sleep mode */ -#define HSE 0x0080 /* b7: Hi-speed enable */ -#define DCFM 0x0040 /* b6: Controller function select */ -#define DRPD 0x0020 /* b5: D+/- pull down control */ -#define DPRPU 0x0010 /* b4: D+ pull up control */ -#define USBE 0x0001 /* b0: USB module operation enable */ - -/* System Configuration Status Register */ -#define OVCBIT 0x8000 /* b15-14: Over-current bit */ -#define OVCMON 0xC000 /* b15-14: Over-current monitor */ -#define SOFEA 0x0020 /* b5: SOF monitor */ -#define IDMON 0x0004 /* b3: ID-pin monitor */ -#define LNST 0x0003 /* b1-0: D+, D- line status */ -#define SE1 0x0003 /* SE1 */ -#define FS_KSTS 0x0002 /* Full-Speed K State */ -#define FS_JSTS 0x0001 /* Full-Speed J State */ -#define LS_JSTS 0x0002 /* Low-Speed J State */ -#define LS_KSTS 0x0001 /* Low-Speed K State */ -#define SE0 0x0000 /* SE0 */ - -/* Device State Control Register */ -#define EXTLP0 0x0400 /* b10: External port */ -#define VBOUT 0x0200 /* b9: VBUS output */ -#define WKUP 0x0100 /* b8: Remote wakeup */ -#define RWUPE 0x0080 /* b7: Remote wakeup sense */ -#define USBRST 0x0040 /* b6: USB reset enable */ -#define RESUME 0x0020 /* b5: Resume enable */ -#define UACT 0x0010 /* b4: USB bus enable */ -#define RHST 0x0007 /* b1-0: Reset handshake status */ -#define HSPROC 0x0004 /* HS handshake is processing */ -#define HSMODE 0x0003 /* Hi-Speed mode */ -#define FSMODE 0x0002 /* Full-Speed mode */ -#define LSMODE 0x0001 /* Low-Speed mode */ -#define UNDECID 0x0000 /* Undecided */ - -/* Test Mode Register */ -#define UTST 0x000F /* b3-0: Test select */ -#define H_TST_PACKET 0x000C /* HOST TEST Packet */ -#define H_TST_SE0_NAK 0x000B /* HOST TEST SE0 NAK */ -#define H_TST_K 0x000A /* HOST TEST K */ -#define H_TST_J 0x0009 /* HOST TEST J */ -#define H_TST_NORMAL 0x0000 /* HOST Normal Mode */ -#define P_TST_PACKET 0x0004 /* PERI TEST Packet */ -#define P_TST_SE0_NAK 0x0003 /* PERI TEST SE0 NAK */ -#define P_TST_K 0x0002 /* PERI TEST K */ -#define P_TST_J 0x0001 /* PERI TEST J */ -#define P_TST_NORMAL 0x0000 /* PERI Normal Mode */ - -/* Data Pin Configuration Register */ -#define LDRV 0x8000 /* b15: Drive Current Adjust */ -#define VIF1 0x0000 /* VIF = 1.8V */ -#define VIF3 0x8000 /* VIF = 3.3V */ -#define INTA 0x0001 /* b1: USB INT-pin active */ - -/* DMAx Pin Configuration Register */ -#define DREQA 0x4000 /* b14: Dreq active select */ -#define BURST 0x2000 /* b13: Burst mode */ -#define DACKA 0x0400 /* b10: Dack active select */ -#define DFORM 0x0380 /* b9-7: DMA mode select */ -#define CPU_ADR_RD_WR 0x0000 /* Address + RD/WR mode (CPU bus) */ -#define CPU_DACK_RD_WR 0x0100 /* DACK + RD/WR mode (CPU bus) */ -#define CPU_DACK_ONLY 0x0180 /* DACK only mode (CPU bus) */ -#define SPLIT_DACK_ONLY 0x0200 /* DACK only mode (SPLIT bus) */ -#define DENDA 0x0040 /* b6: Dend active select */ -#define PKTM 0x0020 /* b5: Packet mode */ -#define DENDE 0x0010 /* b4: Dend enable */ -#define OBUS 0x0004 /* b2: OUTbus mode */ - -/* CFIFO/DxFIFO Port Select Register */ -#define RCNT 0x8000 /* b15: Read count mode */ -#define REW 0x4000 /* b14: Buffer rewind */ -#define DCLRM 0x2000 /* b13: DMA buffer clear mode */ -#define DREQE 0x1000 /* b12: DREQ output enable */ -#define MBW_8 0x0000 /* 8bit */ -#define MBW_16 0x0400 /* 16bit */ -#define MBW_32 0x0800 /* 32bit */ -#define BIGEND 0x0100 /* b8: Big endian mode */ -#define BYTE_LITTLE 0x0000 /* little dendian */ -#define BYTE_BIG 0x0100 /* big endifan */ -#define ISEL 0x0020 /* b5: DCP FIFO port direction select */ -#define CURPIPE 0x000F /* b2-0: PIPE select */ - -/* CFIFO/DxFIFO Port Control Register */ -#define BVAL 0x8000 /* b15: Buffer valid flag */ -#define BCLR 0x4000 /* b14: Buffer clear */ -#define FRDY 0x2000 /* b13: FIFO ready */ -#define DTLN 0x0FFF /* b11-0: FIFO received data length */ - -/* Interrupt Enable Register 0 */ -#define VBSE 0x8000 /* b15: VBUS interrupt */ -#define RSME 0x4000 /* b14: Resume interrupt */ -#define SOFE 0x2000 /* b13: Frame update interrupt */ -#define DVSE 0x1000 /* b12: Device state transition interrupt */ -#define CTRE 0x0800 /* b11: Control transfer stage transition interrupt */ -#define BEMPE 0x0400 /* b10: Buffer empty interrupt */ -#define NRDYE 0x0200 /* b9: Buffer not ready interrupt */ -#define BRDYE 0x0100 /* b8: Buffer ready interrupt */ - -/* Interrupt Enable Register 1 */ -#define OVRCRE 0x8000 /* b15: Over-current interrupt */ -#define BCHGE 0x4000 /* b14: USB us chenge interrupt */ -#define DTCHE 0x1000 /* b12: Detach sense interrupt */ -#define ATTCHE 0x0800 /* b11: Attach sense interrupt */ -#define EOFERRE 0x0040 /* b6: EOF error interrupt */ -#define SIGNE 0x0020 /* b5: SETUP IGNORE interrupt */ -#define SACKE 0x0010 /* b4: SETUP ACK interrupt */ - -/* BRDY Interrupt Enable/Status Register */ -#define BRDY9 0x0200 /* b9: PIPE9 */ -#define BRDY8 0x0100 /* b8: PIPE8 */ -#define BRDY7 0x0080 /* b7: PIPE7 */ -#define BRDY6 0x0040 /* b6: PIPE6 */ -#define BRDY5 0x0020 /* b5: PIPE5 */ -#define BRDY4 0x0010 /* b4: PIPE4 */ -#define BRDY3 0x0008 /* b3: PIPE3 */ -#define BRDY2 0x0004 /* b2: PIPE2 */ -#define BRDY1 0x0002 /* b1: PIPE1 */ -#define BRDY0 0x0001 /* b1: PIPE0 */ - -/* NRDY Interrupt Enable/Status Register */ -#define NRDY9 0x0200 /* b9: PIPE9 */ -#define NRDY8 0x0100 /* b8: PIPE8 */ -#define NRDY7 0x0080 /* b7: PIPE7 */ -#define NRDY6 0x0040 /* b6: PIPE6 */ -#define NRDY5 0x0020 /* b5: PIPE5 */ -#define NRDY4 0x0010 /* b4: PIPE4 */ -#define NRDY3 0x0008 /* b3: PIPE3 */ -#define NRDY2 0x0004 /* b2: PIPE2 */ -#define NRDY1 0x0002 /* b1: PIPE1 */ -#define NRDY0 0x0001 /* b1: PIPE0 */ - -/* BEMP Interrupt Enable/Status Register */ -#define BEMP9 0x0200 /* b9: PIPE9 */ -#define BEMP8 0x0100 /* b8: PIPE8 */ -#define BEMP7 0x0080 /* b7: PIPE7 */ -#define BEMP6 0x0040 /* b6: PIPE6 */ -#define BEMP5 0x0020 /* b5: PIPE5 */ -#define BEMP4 0x0010 /* b4: PIPE4 */ -#define BEMP3 0x0008 /* b3: PIPE3 */ -#define BEMP2 0x0004 /* b2: PIPE2 */ -#define BEMP1 0x0002 /* b1: PIPE1 */ -#define BEMP0 0x0001 /* b0: PIPE0 */ - -/* SOF Pin Configuration Register */ -#define TRNENSEL 0x0100 /* b8: Select transaction enable period */ -#define BRDYM 0x0040 /* b6: BRDY clear timing */ -#define INTL 0x0020 /* b5: Interrupt sense select */ -#define EDGESTS 0x0010 /* b4: */ -#define SOFMODE 0x000C /* b3-2: SOF pin select */ -#define SOF_125US 0x0008 /* SOF OUT 125us Frame Signal */ -#define SOF_1MS 0x0004 /* SOF OUT 1ms Frame Signal */ -#define SOF_DISABLE 0x0000 /* SOF OUT Disable */ - -/* Interrupt Status Register 0 */ -#define VBINT 0x8000 /* b15: VBUS interrupt */ -#define RESM 0x4000 /* b14: Resume interrupt */ -#define SOFR 0x2000 /* b13: SOF frame update interrupt */ -#define DVST 0x1000 /* b12: Device state transition interrupt */ -#define CTRT 0x0800 /* b11: Control transfer stage transition interrupt */ -#define BEMP 0x0400 /* b10: Buffer empty interrupt */ -#define NRDY 0x0200 /* b9: Buffer not ready interrupt */ -#define BRDY 0x0100 /* b8: Buffer ready interrupt */ -#define VBSTS 0x0080 /* b7: VBUS input port */ -#define DVSQ 0x0070 /* b6-4: Device state */ -#define DS_SPD_CNFG 0x0070 /* Suspend Configured */ -#define DS_SPD_ADDR 0x0060 /* Suspend Address */ -#define DS_SPD_DFLT 0x0050 /* Suspend Default */ -#define DS_SPD_POWR 0x0040 /* Suspend Powered */ -#define DS_SUSP 0x0040 /* Suspend */ -#define DS_CNFG 0x0030 /* Configured */ -#define DS_ADDS 0x0020 /* Address */ -#define DS_DFLT 0x0010 /* Default */ -#define DS_POWR 0x0000 /* Powered */ -#define DVSQS 0x0030 /* b5-4: Device state */ -#define VALID 0x0008 /* b3: Setup packet detected flag */ -#define CTSQ 0x0007 /* b2-0: Control transfer stage */ -#define CS_SQER 0x0006 /* Sequence error */ -#define CS_WRND 0x0005 /* Control write nodata status stage */ -#define CS_WRSS 0x0004 /* Control write status stage */ -#define CS_WRDS 0x0003 /* Control write data stage */ -#define CS_RDSS 0x0002 /* Control read status stage */ -#define CS_RDDS 0x0001 /* Control read data stage */ -#define CS_IDST 0x0000 /* Idle or setup stage */ - -/* Interrupt Status Register 1 */ -#define OVRCR 0x8000 /* b15: Over-current interrupt */ -#define BCHG 0x4000 /* b14: USB bus chenge interrupt */ -#define DTCH 0x1000 /* b12: Detach sense interrupt */ -#define ATTCH 0x0800 /* b11: Attach sense interrupt */ -#define EOFERR 0x0040 /* b6: EOF-error interrupt */ -#define SIGN 0x0020 /* b5: Setup ignore interrupt */ -#define SACK 0x0010 /* b4: Setup acknowledge interrupt */ - -/* Frame Number Register */ -#define OVRN 0x8000 /* b15: Overrun error */ -#define CRCE 0x4000 /* b14: Received data error */ -#define FRNM 0x07FF /* b10-0: Frame number */ - -/* Micro Frame Number Register */ -#define UFRNM 0x0007 /* b2-0: Micro frame number */ - -/* Default Control Pipe Maxpacket Size Register */ -/* Pipe Maxpacket Size Register */ -#define DEVSEL 0xF000 /* b15-14: Device address select */ -#define MAXP 0x007F /* b6-0: Maxpacket size of default control pipe */ - -/* Default Control Pipe Control Register */ -#define BSTS 0x8000 /* b15: Buffer status */ -#define SUREQ 0x4000 /* b14: Send USB request */ -#define CSCLR 0x2000 /* b13: complete-split status clear */ -#define CSSTS 0x1000 /* b12: complete-split status */ -#define SUREQCLR 0x0800 /* b11: stop setup request */ -#define SQCLR 0x0100 /* b8: Sequence toggle bit clear */ -#define SQSET 0x0080 /* b7: Sequence toggle bit set */ -#define SQMON 0x0040 /* b6: Sequence toggle bit monitor */ -#define PBUSY 0x0020 /* b5: pipe busy */ -#define PINGE 0x0010 /* b4: ping enable */ -#define CCPL 0x0004 /* b2: Enable control transfer complete */ -#define PID 0x0003 /* b1-0: Response PID */ -#define PID_STALL11 0x0003 /* STALL */ -#define PID_STALL 0x0002 /* STALL */ -#define PID_BUF 0x0001 /* BUF */ -#define PID_NAK 0x0000 /* NAK */ - -/* Pipe Window Select Register */ -#define PIPENM 0x0007 /* b2-0: Pipe select */ - -/* Pipe Configuration Register */ -#define R8A66597_TYP 0xC000 /* b15-14: Transfer type */ -#define R8A66597_ISO 0xC000 /* Isochronous */ -#define R8A66597_INT 0x8000 /* Interrupt */ -#define R8A66597_BULK 0x4000 /* Bulk */ -#define R8A66597_BFRE 0x0400 /* b10: Buffer ready interrupt mode select */ -#define R8A66597_DBLB 0x0200 /* b9: Double buffer mode select */ -#define R8A66597_CNTMD 0x0100 /* b8: Continuous transfer mode select */ -#define R8A66597_SHTNAK 0x0080 /* b7: Transfer end NAK */ -#define R8A66597_DIR 0x0010 /* b4: Transfer direction select */ -#define R8A66597_EPNUM 0x000F /* b3-0: Eendpoint number select */ - -/* Pipe Buffer Configuration Register */ -#define BUFSIZE 0x7C00 /* b14-10: Pipe buffer size */ -#define BUFNMB 0x007F /* b6-0: Pipe buffer number */ -#define PIPE0BUF 256 -#define PIPExBUF 64 - -/* Pipe Maxpacket Size Register */ -#define MXPS 0x07FF /* b10-0: Maxpacket size */ - -/* Pipe Cycle Configuration Register */ -#define IFIS 0x1000 /* b12: Isochronous in-buffer flush mode select */ -#define IITV 0x0007 /* b2-0: Isochronous interval */ - -/* Pipex Control Register */ -#define BSTS 0x8000 /* b15: Buffer status */ -#define INBUFM 0x4000 /* b14: IN buffer monitor (Only for PIPE1 to 5) */ -#define CSCLR 0x2000 /* b13: complete-split status clear */ -#define CSSTS 0x1000 /* b12: complete-split status */ -#define ATREPM 0x0400 /* b10: Auto repeat mode */ -#define ACLRM 0x0200 /* b9: Out buffer auto clear mode */ -#define SQCLR 0x0100 /* b8: Sequence toggle bit clear */ -#define SQSET 0x0080 /* b7: Sequence toggle bit set */ -#define SQMON 0x0040 /* b6: Sequence toggle bit monitor */ -#define PBUSY 0x0020 /* b5: pipe busy */ -#define PID 0x0003 /* b1-0: Response PID */ - -/* PIPExTRE */ -#define TRENB 0x0200 /* b9: Transaction counter enable */ -#define TRCLR 0x0100 /* b8: Transaction counter clear */ - -/* PIPExTRN */ -#define TRNCNT 0xFFFF /* b15-0: Transaction counter */ - -/* DEVADDx */ -#define UPPHUB 0x7800 -#define HUBPORT 0x0700 -#define USBSPD 0x00C0 -#define RTPORT 0x0001 - #define R8A66597_MAX_NUM_PIPE 10 #define R8A66597_BUF_BSIZE 8 #define R8A66597_MAX_DEVICE 10 diff --git a/include/linux/usb/r8a66597.h b/include/linux/usb/r8a66597.h index 460ee3f6a2c6..26d216734057 100644 --- a/include/linux/usb/r8a66597.h +++ b/include/linux/usb/r8a66597.h @@ -28,7 +28,7 @@ #define R8A66597_PLATDATA_XTAL_48MHZ 0x03 struct r8a66597_platdata { - /* This ops can controll port power instead of DVSTCTR register. */ + /* This callback can control port power instead of DVSTCTR register. */ void (*port_power)(int port, int power); /* set one = on chip controller, set zero = external controller */ @@ -43,5 +43,373 @@ struct r8a66597_platdata { /* set one = big endian, set zero = little endian */ unsigned endian:1; }; -#endif + +/* Register definitions */ +#define SYSCFG0 0x00 +#define SYSCFG1 0x02 +#define SYSSTS0 0x04 +#define SYSSTS1 0x06 +#define DVSTCTR0 0x08 +#define DVSTCTR1 0x0A +#define TESTMODE 0x0C +#define PINCFG 0x0E +#define DMA0CFG 0x10 +#define DMA1CFG 0x12 +#define CFIFO 0x14 +#define D0FIFO 0x18 +#define D1FIFO 0x1C +#define CFIFOSEL 0x20 +#define CFIFOCTR 0x22 +#define CFIFOSIE 0x24 +#define D0FIFOSEL 0x28 +#define D0FIFOCTR 0x2A +#define D1FIFOSEL 0x2C +#define D1FIFOCTR 0x2E +#define INTENB0 0x30 +#define INTENB1 0x32 +#define INTENB2 0x34 +#define BRDYENB 0x36 +#define NRDYENB 0x38 +#define BEMPENB 0x3A +#define SOFCFG 0x3C +#define INTSTS0 0x40 +#define INTSTS1 0x42 +#define INTSTS2 0x44 +#define BRDYSTS 0x46 +#define NRDYSTS 0x48 +#define BEMPSTS 0x4A +#define FRMNUM 0x4C +#define UFRMNUM 0x4E +#define USBADDR 0x50 +#define USBREQ 0x54 +#define USBVAL 0x56 +#define USBINDX 0x58 +#define USBLENG 0x5A +#define DCPCFG 0x5C +#define DCPMAXP 0x5E +#define DCPCTR 0x60 +#define PIPESEL 0x64 +#define PIPECFG 0x68 +#define PIPEBUF 0x6A +#define PIPEMAXP 0x6C +#define PIPEPERI 0x6E +#define PIPE1CTR 0x70 +#define PIPE2CTR 0x72 +#define PIPE3CTR 0x74 +#define PIPE4CTR 0x76 +#define PIPE5CTR 0x78 +#define PIPE6CTR 0x7A +#define PIPE7CTR 0x7C +#define PIPE8CTR 0x7E +#define PIPE9CTR 0x80 +#define PIPE1TRE 0x90 +#define PIPE1TRN 0x92 +#define PIPE2TRE 0x94 +#define PIPE2TRN 0x96 +#define PIPE3TRE 0x98 +#define PIPE3TRN 0x9A +#define PIPE4TRE 0x9C +#define PIPE4TRN 0x9E +#define PIPE5TRE 0xA0 +#define PIPE5TRN 0xA2 +#define DEVADD0 0xD0 +#define DEVADD1 0xD2 +#define DEVADD2 0xD4 +#define DEVADD3 0xD6 +#define DEVADD4 0xD8 +#define DEVADD5 0xDA +#define DEVADD6 0xDC +#define DEVADD7 0xDE +#define DEVADD8 0xE0 +#define DEVADD9 0xE2 +#define DEVADDA 0xE4 + +/* System Configuration Control Register */ +#define XTAL 0xC000 /* b15-14: Crystal selection */ +#define XTAL48 0x8000 /* 48MHz */ +#define XTAL24 0x4000 /* 24MHz */ +#define XTAL12 0x0000 /* 12MHz */ +#define XCKE 0x2000 /* b13: External clock enable */ +#define PLLC 0x0800 /* b11: PLL control */ +#define SCKE 0x0400 /* b10: USB clock enable */ +#define PCSDIS 0x0200 /* b9: not CS wakeup */ +#define LPSME 0x0100 /* b8: Low power sleep mode */ +#define HSE 0x0080 /* b7: Hi-speed enable */ +#define DCFM 0x0040 /* b6: Controller function select */ +#define DRPD 0x0020 /* b5: D+/- pull down control */ +#define DPRPU 0x0010 /* b4: D+ pull up control */ +#define USBE 0x0001 /* b0: USB module operation enable */ + +/* System Configuration Status Register */ +#define OVCBIT 0x8000 /* b15-14: Over-current bit */ +#define OVCMON 0xC000 /* b15-14: Over-current monitor */ +#define SOFEA 0x0020 /* b5: SOF monitor */ +#define IDMON 0x0004 /* b3: ID-pin monitor */ +#define LNST 0x0003 /* b1-0: D+, D- line status */ +#define SE1 0x0003 /* SE1 */ +#define FS_KSTS 0x0002 /* Full-Speed K State */ +#define FS_JSTS 0x0001 /* Full-Speed J State */ +#define LS_JSTS 0x0002 /* Low-Speed J State */ +#define LS_KSTS 0x0001 /* Low-Speed K State */ +#define SE0 0x0000 /* SE0 */ + +/* Device State Control Register */ +#define EXTLP0 0x0400 /* b10: External port */ +#define VBOUT 0x0200 /* b9: VBUS output */ +#define WKUP 0x0100 /* b8: Remote wakeup */ +#define RWUPE 0x0080 /* b7: Remote wakeup sense */ +#define USBRST 0x0040 /* b6: USB reset enable */ +#define RESUME 0x0020 /* b5: Resume enable */ +#define UACT 0x0010 /* b4: USB bus enable */ +#define RHST 0x0007 /* b1-0: Reset handshake status */ +#define HSPROC 0x0004 /* HS handshake is processing */ +#define HSMODE 0x0003 /* Hi-Speed mode */ +#define FSMODE 0x0002 /* Full-Speed mode */ +#define LSMODE 0x0001 /* Low-Speed mode */ +#define UNDECID 0x0000 /* Undecided */ + +/* Test Mode Register */ +#define UTST 0x000F /* b3-0: Test select */ +#define H_TST_PACKET 0x000C /* HOST TEST Packet */ +#define H_TST_SE0_NAK 0x000B /* HOST TEST SE0 NAK */ +#define H_TST_K 0x000A /* HOST TEST K */ +#define H_TST_J 0x0009 /* HOST TEST J */ +#define H_TST_NORMAL 0x0000 /* HOST Normal Mode */ +#define P_TST_PACKET 0x0004 /* PERI TEST Packet */ +#define P_TST_SE0_NAK 0x0003 /* PERI TEST SE0 NAK */ +#define P_TST_K 0x0002 /* PERI TEST K */ +#define P_TST_J 0x0001 /* PERI TEST J */ +#define P_TST_NORMAL 0x0000 /* PERI Normal Mode */ + +/* Data Pin Configuration Register */ +#define LDRV 0x8000 /* b15: Drive Current Adjust */ +#define VIF1 0x0000 /* VIF = 1.8V */ +#define VIF3 0x8000 /* VIF = 3.3V */ +#define INTA 0x0001 /* b1: USB INT-pin active */ + +/* DMAx Pin Configuration Register */ +#define DREQA 0x4000 /* b14: Dreq active select */ +#define BURST 0x2000 /* b13: Burst mode */ +#define DACKA 0x0400 /* b10: Dack active select */ +#define DFORM 0x0380 /* b9-7: DMA mode select */ +#define CPU_ADR_RD_WR 0x0000 /* Address + RD/WR mode (CPU bus) */ +#define CPU_DACK_RD_WR 0x0100 /* DACK + RD/WR mode (CPU bus) */ +#define CPU_DACK_ONLY 0x0180 /* DACK only mode (CPU bus) */ +#define SPLIT_DACK_ONLY 0x0200 /* DACK only mode (SPLIT bus) */ +#define DENDA 0x0040 /* b6: Dend active select */ +#define PKTM 0x0020 /* b5: Packet mode */ +#define DENDE 0x0010 /* b4: Dend enable */ +#define OBUS 0x0004 /* b2: OUTbus mode */ + +/* CFIFO/DxFIFO Port Select Register */ +#define RCNT 0x8000 /* b15: Read count mode */ +#define REW 0x4000 /* b14: Buffer rewind */ +#define DCLRM 0x2000 /* b13: DMA buffer clear mode */ +#define DREQE 0x1000 /* b12: DREQ output enable */ +#define MBW_8 0x0000 /* 8bit */ +#define MBW_16 0x0400 /* 16bit */ +#define MBW_32 0x0800 /* 32bit */ +#define BIGEND 0x0100 /* b8: Big endian mode */ +#define BYTE_LITTLE 0x0000 /* little dendian */ +#define BYTE_BIG 0x0100 /* big endifan */ +#define ISEL 0x0020 /* b5: DCP FIFO port direction select */ +#define CURPIPE 0x000F /* b2-0: PIPE select */ + +/* CFIFO/DxFIFO Port Control Register */ +#define BVAL 0x8000 /* b15: Buffer valid flag */ +#define BCLR 0x4000 /* b14: Buffer clear */ +#define FRDY 0x2000 /* b13: FIFO ready */ +#define DTLN 0x0FFF /* b11-0: FIFO received data length */ + +/* Interrupt Enable Register 0 */ +#define VBSE 0x8000 /* b15: VBUS interrupt */ +#define RSME 0x4000 /* b14: Resume interrupt */ +#define SOFE 0x2000 /* b13: Frame update interrupt */ +#define DVSE 0x1000 /* b12: Device state transition interrupt */ +#define CTRE 0x0800 /* b11: Control transfer stage transition interrupt */ +#define BEMPE 0x0400 /* b10: Buffer empty interrupt */ +#define NRDYE 0x0200 /* b9: Buffer not ready interrupt */ +#define BRDYE 0x0100 /* b8: Buffer ready interrupt */ + +/* Interrupt Enable Register 1 */ +#define OVRCRE 0x8000 /* b15: Over-current interrupt */ +#define BCHGE 0x4000 /* b14: USB us chenge interrupt */ +#define DTCHE 0x1000 /* b12: Detach sense interrupt */ +#define ATTCHE 0x0800 /* b11: Attach sense interrupt */ +#define EOFERRE 0x0040 /* b6: EOF error interrupt */ +#define SIGNE 0x0020 /* b5: SETUP IGNORE interrupt */ +#define SACKE 0x0010 /* b4: SETUP ACK interrupt */ + +/* BRDY Interrupt Enable/Status Register */ +#define BRDY9 0x0200 /* b9: PIPE9 */ +#define BRDY8 0x0100 /* b8: PIPE8 */ +#define BRDY7 0x0080 /* b7: PIPE7 */ +#define BRDY6 0x0040 /* b6: PIPE6 */ +#define BRDY5 0x0020 /* b5: PIPE5 */ +#define BRDY4 0x0010 /* b4: PIPE4 */ +#define BRDY3 0x0008 /* b3: PIPE3 */ +#define BRDY2 0x0004 /* b2: PIPE2 */ +#define BRDY1 0x0002 /* b1: PIPE1 */ +#define BRDY0 0x0001 /* b1: PIPE0 */ + +/* NRDY Interrupt Enable/Status Register */ +#define NRDY9 0x0200 /* b9: PIPE9 */ +#define NRDY8 0x0100 /* b8: PIPE8 */ +#define NRDY7 0x0080 /* b7: PIPE7 */ +#define NRDY6 0x0040 /* b6: PIPE6 */ +#define NRDY5 0x0020 /* b5: PIPE5 */ +#define NRDY4 0x0010 /* b4: PIPE4 */ +#define NRDY3 0x0008 /* b3: PIPE3 */ +#define NRDY2 0x0004 /* b2: PIPE2 */ +#define NRDY1 0x0002 /* b1: PIPE1 */ +#define NRDY0 0x0001 /* b1: PIPE0 */ + +/* BEMP Interrupt Enable/Status Register */ +#define BEMP9 0x0200 /* b9: PIPE9 */ +#define BEMP8 0x0100 /* b8: PIPE8 */ +#define BEMP7 0x0080 /* b7: PIPE7 */ +#define BEMP6 0x0040 /* b6: PIPE6 */ +#define BEMP5 0x0020 /* b5: PIPE5 */ +#define BEMP4 0x0010 /* b4: PIPE4 */ +#define BEMP3 0x0008 /* b3: PIPE3 */ +#define BEMP2 0x0004 /* b2: PIPE2 */ +#define BEMP1 0x0002 /* b1: PIPE1 */ +#define BEMP0 0x0001 /* b0: PIPE0 */ + +/* SOF Pin Configuration Register */ +#define TRNENSEL 0x0100 /* b8: Select transaction enable period */ +#define BRDYM 0x0040 /* b6: BRDY clear timing */ +#define INTL 0x0020 /* b5: Interrupt sense select */ +#define EDGESTS 0x0010 /* b4: */ +#define SOFMODE 0x000C /* b3-2: SOF pin select */ +#define SOF_125US 0x0008 /* SOF OUT 125us Frame Signal */ +#define SOF_1MS 0x0004 /* SOF OUT 1ms Frame Signal */ +#define SOF_DISABLE 0x0000 /* SOF OUT Disable */ + +/* Interrupt Status Register 0 */ +#define VBINT 0x8000 /* b15: VBUS interrupt */ +#define RESM 0x4000 /* b14: Resume interrupt */ +#define SOFR 0x2000 /* b13: SOF frame update interrupt */ +#define DVST 0x1000 /* b12: Device state transition interrupt */ +#define CTRT 0x0800 /* b11: Control transfer stage transition interrupt */ +#define BEMP 0x0400 /* b10: Buffer empty interrupt */ +#define NRDY 0x0200 /* b9: Buffer not ready interrupt */ +#define BRDY 0x0100 /* b8: Buffer ready interrupt */ +#define VBSTS 0x0080 /* b7: VBUS input port */ +#define DVSQ 0x0070 /* b6-4: Device state */ +#define DS_SPD_CNFG 0x0070 /* Suspend Configured */ +#define DS_SPD_ADDR 0x0060 /* Suspend Address */ +#define DS_SPD_DFLT 0x0050 /* Suspend Default */ +#define DS_SPD_POWR 0x0040 /* Suspend Powered */ +#define DS_SUSP 0x0040 /* Suspend */ +#define DS_CNFG 0x0030 /* Configured */ +#define DS_ADDS 0x0020 /* Address */ +#define DS_DFLT 0x0010 /* Default */ +#define DS_POWR 0x0000 /* Powered */ +#define DVSQS 0x0030 /* b5-4: Device state */ +#define VALID 0x0008 /* b3: Setup packet detected flag */ +#define CTSQ 0x0007 /* b2-0: Control transfer stage */ +#define CS_SQER 0x0006 /* Sequence error */ +#define CS_WRND 0x0005 /* Control write nodata status stage */ +#define CS_WRSS 0x0004 /* Control write status stage */ +#define CS_WRDS 0x0003 /* Control write data stage */ +#define CS_RDSS 0x0002 /* Control read status stage */ +#define CS_RDDS 0x0001 /* Control read data stage */ +#define CS_IDST 0x0000 /* Idle or setup stage */ + +/* Interrupt Status Register 1 */ +#define OVRCR 0x8000 /* b15: Over-current interrupt */ +#define BCHG 0x4000 /* b14: USB bus chenge interrupt */ +#define DTCH 0x1000 /* b12: Detach sense interrupt */ +#define ATTCH 0x0800 /* b11: Attach sense interrupt */ +#define EOFERR 0x0040 /* b6: EOF-error interrupt */ +#define SIGN 0x0020 /* b5: Setup ignore interrupt */ +#define SACK 0x0010 /* b4: Setup acknowledge interrupt */ + +/* Frame Number Register */ +#define OVRN 0x8000 /* b15: Overrun error */ +#define CRCE 0x4000 /* b14: Received data error */ +#define FRNM 0x07FF /* b10-0: Frame number */ + +/* Micro Frame Number Register */ +#define UFRNM 0x0007 /* b2-0: Micro frame number */ + +/* Default Control Pipe Maxpacket Size Register */ +/* Pipe Maxpacket Size Register */ +#define DEVSEL 0xF000 /* b15-14: Device address select */ +#define MAXP 0x007F /* b6-0: Maxpacket size of default control pipe */ + +/* Default Control Pipe Control Register */ +#define BSTS 0x8000 /* b15: Buffer status */ +#define SUREQ 0x4000 /* b14: Send USB request */ +#define CSCLR 0x2000 /* b13: complete-split status clear */ +#define CSSTS 0x1000 /* b12: complete-split status */ +#define SUREQCLR 0x0800 /* b11: stop setup request */ +#define SQCLR 0x0100 /* b8: Sequence toggle bit clear */ +#define SQSET 0x0080 /* b7: Sequence toggle bit set */ +#define SQMON 0x0040 /* b6: Sequence toggle bit monitor */ +#define PBUSY 0x0020 /* b5: pipe busy */ +#define PINGE 0x0010 /* b4: ping enable */ +#define CCPL 0x0004 /* b2: Enable control transfer complete */ +#define PID 0x0003 /* b1-0: Response PID */ +#define PID_STALL11 0x0003 /* STALL */ +#define PID_STALL 0x0002 /* STALL */ +#define PID_BUF 0x0001 /* BUF */ +#define PID_NAK 0x0000 /* NAK */ + +/* Pipe Window Select Register */ +#define PIPENM 0x0007 /* b2-0: Pipe select */ + +/* Pipe Configuration Register */ +#define R8A66597_TYP 0xC000 /* b15-14: Transfer type */ +#define R8A66597_ISO 0xC000 /* Isochronous */ +#define R8A66597_INT 0x8000 /* Interrupt */ +#define R8A66597_BULK 0x4000 /* Bulk */ +#define R8A66597_BFRE 0x0400 /* b10: Buffer ready interrupt mode select */ +#define R8A66597_DBLB 0x0200 /* b9: Double buffer mode select */ +#define R8A66597_CNTMD 0x0100 /* b8: Continuous transfer mode select */ +#define R8A66597_SHTNAK 0x0080 /* b7: Transfer end NAK */ +#define R8A66597_DIR 0x0010 /* b4: Transfer direction select */ +#define R8A66597_EPNUM 0x000F /* b3-0: Eendpoint number select */ + +/* Pipe Buffer Configuration Register */ +#define BUFSIZE 0x7C00 /* b14-10: Pipe buffer size */ +#define BUFNMB 0x007F /* b6-0: Pipe buffer number */ +#define PIPE0BUF 256 +#define PIPExBUF 64 + +/* Pipe Maxpacket Size Register */ +#define MXPS 0x07FF /* b10-0: Maxpacket size */ + +/* Pipe Cycle Configuration Register */ +#define IFIS 0x1000 /* b12: Isochronous in-buffer flush mode select */ +#define IITV 0x0007 /* b2-0: Isochronous interval */ + +/* Pipex Control Register */ +#define BSTS 0x8000 /* b15: Buffer status */ +#define INBUFM 0x4000 /* b14: IN buffer monitor (Only for PIPE1 to 5) */ +#define CSCLR 0x2000 /* b13: complete-split status clear */ +#define CSSTS 0x1000 /* b12: complete-split status */ +#define ATREPM 0x0400 /* b10: Auto repeat mode */ +#define ACLRM 0x0200 /* b9: Out buffer auto clear mode */ +#define SQCLR 0x0100 /* b8: Sequence toggle bit clear */ +#define SQSET 0x0080 /* b7: Sequence toggle bit set */ +#define SQMON 0x0040 /* b6: Sequence toggle bit monitor */ +#define PBUSY 0x0020 /* b5: pipe busy */ +#define PID 0x0003 /* b1-0: Response PID */ + +/* PIPExTRE */ +#define TRENB 0x0200 /* b9: Transaction counter enable */ +#define TRCLR 0x0100 /* b8: Transaction counter clear */ + +/* PIPExTRN */ +#define TRNCNT 0xFFFF /* b15-0: Transaction counter */ + +/* DEVADDx */ +#define UPPHUB 0x7800 +#define HUBPORT 0x0700 +#define USBSPD 0x00C0 +#define RTPORT 0x0001 + +#endif /* __LINUX_USB_R8A66597_H */ -- cgit v1.2.3-71-gd317 From 2c59b0b70b9d5d61c726f179724660c4c2423f31 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 22 Jul 2009 14:41:35 +0000 Subject: usb: m66592-udc platform data on_chip support Convert the m66592-udc driver to use the on_chip flag from platform data to enable on chip behaviour instead of relying on CONFIG_SUPERH_BUILT_IN_M66592 ugliness. This makes the code cleaner and also allows us to support both external and internal m66592 with the same kernel. It also makes the Kconfig part more future proof since we with this patch can add support for new processors with on-chip m66592 without modifying the Kconfig. The patch adds a m66592 header file for platform data and ties in platform data to the existing m66592 devices. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/boards/mach-highlander/setup.c | 7 + arch/sh/boards/mach-x3proto/setup.c | 7 + arch/sh/kernel/cpu/sh4a/setup-sh7722.c | 8 +- drivers/usb/gadget/Kconfig | 10 -- drivers/usb/gadget/m66592-udc.c | 252 +++++++++++++++++++-------------- drivers/usb/gadget/m66592-udc.h | 89 ++++++------ include/linux/usb/m66592.h | 44 ++++++ 7 files changed, 257 insertions(+), 160 deletions(-) create mode 100644 include/linux/usb/m66592.h (limited to 'include/linux') diff --git a/arch/sh/boards/mach-highlander/setup.c b/arch/sh/boards/mach-highlander/setup.c index 1639f8915000..566e69d8d729 100644 --- a/arch/sh/boards/mach-highlander/setup.c +++ b/arch/sh/boards/mach-highlander/setup.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -60,6 +61,11 @@ static struct platform_device r8a66597_usb_host_device = { .resource = r8a66597_usb_host_resources, }; +static struct m66592_platdata usbf_platdata = { + .xtal = M66592_PLATDATA_XTAL_24MHZ, + .vif = 1, +}; + static struct resource m66592_usb_peripheral_resources[] = { [0] = { .name = "m66592_udc", @@ -81,6 +87,7 @@ static struct platform_device m66592_usb_peripheral_device = { .dev = { .dma_mask = NULL, /* don't use dma */ .coherent_dma_mask = 0xffffffff, + .platform_data = &usbf_platdata, }, .num_resources = ARRAY_SIZE(m66592_usb_peripheral_resources), .resource = m66592_usb_peripheral_resources, diff --git a/arch/sh/boards/mach-x3proto/setup.c b/arch/sh/boards/mach-x3proto/setup.c index 8913ae39a802..efe4cb9f8a77 100644 --- a/arch/sh/boards/mach-x3proto/setup.c +++ b/arch/sh/boards/mach-x3proto/setup.c @@ -17,6 +17,7 @@ #include #include #include +#include #include static struct resource heartbeat_resources[] = { @@ -89,6 +90,11 @@ static struct platform_device r8a66597_usb_host_device = { .resource = r8a66597_usb_host_resources, }; +static struct m66592_platdata usbf_platdata = { + .xtal = M66592_PLATDATA_XTAL_24MHZ, + .vif = 1, +}; + static struct resource m66592_usb_peripheral_resources[] = { [0] = { .name = "m66592_udc", @@ -109,6 +115,7 @@ static struct platform_device m66592_usb_peripheral_device = { .dev = { .dma_mask = NULL, /* don't use dma */ .coherent_dma_mask = 0xffffffff, + .platform_data = &usbf_platdata, }, .num_resources = ARRAY_SIZE(m66592_usb_peripheral_resources), .resource = m66592_usb_peripheral_resources, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c index ea524a2da3e4..0bad14a44238 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -47,9 +48,13 @@ static struct platform_device rtc_device = { .resource = rtc_resources, }; +static struct m66592_platdata usbf_platdata = { + .on_chip = 1, +}; + static struct resource usbf_resources[] = { [0] = { - .name = "m66592_udc", + .name = "USBF", .start = 0x04480000, .end = 0x044800FF, .flags = IORESOURCE_MEM, @@ -67,6 +72,7 @@ static struct platform_device usbf_device = { .dev = { .dma_mask = NULL, .coherent_dma_mask = 0xffffffff, + .platform_data = &usbf_platdata, }, .num_resources = ARRAY_SIZE(usbf_resources), .resource = usbf_resources, diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig index 7f8e83a954ac..b7f10bc25c2c 100644 --- a/drivers/usb/gadget/Kconfig +++ b/drivers/usb/gadget/Kconfig @@ -360,16 +360,6 @@ config USB_M66592 default USB_GADGET select USB_GADGET_SELECTED -config SUPERH_BUILT_IN_M66592 - boolean "Enable SuperH built-in USB like the M66592" - depends on USB_GADGET_M66592 && CPU_SUBTYPE_SH7722 - help - SH7722 has USB like the M66592. - - The transfer rate is very slow when use "Ethernet Gadget". - However, this problem is improved if change a value of - NET_IP_ALIGN to 4. - # # Controllers available only in discrete form (and all PCI controllers) # diff --git a/drivers/usb/gadget/m66592-udc.c b/drivers/usb/gadget/m66592-udc.c index 0dddd2f8ff35..a61c70caff12 100644 --- a/drivers/usb/gadget/m66592-udc.c +++ b/drivers/usb/gadget/m66592-udc.c @@ -31,38 +31,12 @@ #include "m66592-udc.h" - MODULE_DESCRIPTION("M66592 USB gadget driver"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Yoshihiro Shimoda"); MODULE_ALIAS("platform:m66592_udc"); -#define DRIVER_VERSION "26 Jun 2009" - -/* module parameters */ -#if defined(CONFIG_SUPERH_BUILT_IN_M66592) -static unsigned short endian = M66592_LITTLE; -module_param(endian, ushort, 0644); -MODULE_PARM_DESC(endian, "data endian: big=0, little=0 (default=0)"); -#else -static unsigned short clock = M66592_XTAL24; -module_param(clock, ushort, 0644); -MODULE_PARM_DESC(clock, "input clock: 48MHz=32768, 24MHz=16384, 12MHz=0 " - "(default=16384)"); - -static unsigned short vif = M66592_LDRV; -module_param(vif, ushort, 0644); -MODULE_PARM_DESC(vif, "input VIF: 3.3V=32768, 1.5V=0 (default=32768)"); - -static unsigned short endian; -module_param(endian, ushort, 0644); -MODULE_PARM_DESC(endian, "data endian: big=256, little=0 (default=0)"); - -static unsigned short irq_sense = M66592_INTL; -module_param(irq_sense, ushort, 0644); -MODULE_PARM_DESC(irq_sense, "IRQ sense: low level=2, falling edge=0 " - "(default=2)"); -#endif +#define DRIVER_VERSION "21 July 2009" static const char udc_name[] = "m66592_udc"; static const char *m66592_ep_name[] = { @@ -244,6 +218,7 @@ static inline int get_buffer_size(struct m66592 *m66592, u16 pipenum) static inline void pipe_change(struct m66592 *m66592, u16 pipenum) { struct m66592_ep *ep = m66592->pipenum2ep[pipenum]; + unsigned short mbw; if (ep->use_dma) return; @@ -252,7 +227,12 @@ static inline void pipe_change(struct m66592 *m66592, u16 pipenum) ndelay(450); - m66592_bset(m66592, M66592_MBW, ep->fifosel); + if (m66592->pdata->on_chip) + mbw = M66592_MBW_32; + else + mbw = M66592_MBW_16; + + m66592_bset(m66592, mbw, ep->fifosel); } static int pipe_buffer_setting(struct m66592 *m66592, @@ -332,6 +312,7 @@ static void pipe_buffer_release(struct m66592 *m66592, static void pipe_initialize(struct m66592_ep *ep) { struct m66592 *m66592 = ep->m66592; + unsigned short mbw; m66592_mdfy(m66592, 0, M66592_CURPIPE, ep->fifosel); @@ -343,7 +324,12 @@ static void pipe_initialize(struct m66592_ep *ep) ndelay(450); - m66592_bset(m66592, M66592_MBW, ep->fifosel); + if (m66592->pdata->on_chip) + mbw = M66592_MBW_32; + else + mbw = M66592_MBW_16; + + m66592_bset(m66592, mbw, ep->fifosel); } } @@ -359,15 +345,13 @@ static void m66592_ep_setting(struct m66592 *m66592, struct m66592_ep *ep, ep->fifosel = M66592_D0FIFOSEL; ep->fifoctr = M66592_D0FIFOCTR; ep->fifotrn = M66592_D0FIFOTRN; -#if !defined(CONFIG_SUPERH_BUILT_IN_M66592) - } else if (m66592->num_dma == 1) { + } else if (!m66592->pdata->on_chip && m66592->num_dma == 1) { m66592->num_dma++; ep->use_dma = 1; ep->fifoaddr = M66592_D1FIFO; ep->fifosel = M66592_D1FIFOSEL; ep->fifoctr = M66592_D1FIFOCTR; ep->fifotrn = M66592_D1FIFOTRN; -#endif } else { ep->use_dma = 0; ep->fifoaddr = M66592_CFIFO; @@ -612,76 +596,120 @@ static void start_ep0(struct m66592_ep *ep, struct m66592_request *req) } } -#if defined(CONFIG_SUPERH_BUILT_IN_M66592) static void init_controller(struct m66592 *m66592) { - m66592_bset(m66592, M66592_HSE, M66592_SYSCFG); /* High spd */ - m66592_bclr(m66592, M66592_USBE, M66592_SYSCFG); - m66592_bclr(m66592, M66592_DPRPU, M66592_SYSCFG); - m66592_bset(m66592, M66592_USBE, M66592_SYSCFG); + unsigned int endian; - /* This is a workaound for SH7722 2nd cut */ - m66592_bset(m66592, 0x8000, M66592_DVSTCTR); - m66592_bset(m66592, 0x1000, M66592_TESTMODE); - m66592_bclr(m66592, 0x8000, M66592_DVSTCTR); + if (m66592->pdata->on_chip) { + if (m66592->pdata->endian) + endian = 0; /* big endian */ + else + endian = M66592_LITTLE; /* little endian */ - m66592_bset(m66592, M66592_INTL, M66592_INTENB1); + m66592_bset(m66592, M66592_HSE, M66592_SYSCFG); /* High spd */ + m66592_bclr(m66592, M66592_USBE, M66592_SYSCFG); + m66592_bclr(m66592, M66592_DPRPU, M66592_SYSCFG); + m66592_bset(m66592, M66592_USBE, M66592_SYSCFG); - m66592_write(m66592, 0, M66592_CFBCFG); - m66592_write(m66592, 0, M66592_D0FBCFG); - m66592_bset(m66592, endian, M66592_CFBCFG); - m66592_bset(m66592, endian, M66592_D0FBCFG); -} -#else /* #if defined(CONFIG_SUPERH_BUILT_IN_M66592) */ -static void init_controller(struct m66592 *m66592) -{ - m66592_bset(m66592, (vif & M66592_LDRV) | (endian & M66592_BIGEND), - M66592_PINCFG); - m66592_bset(m66592, M66592_HSE, M66592_SYSCFG); /* High spd */ - m66592_mdfy(m66592, clock & M66592_XTAL, M66592_XTAL, M66592_SYSCFG); + /* This is a workaound for SH7722 2nd cut */ + m66592_bset(m66592, 0x8000, M66592_DVSTCTR); + m66592_bset(m66592, 0x1000, M66592_TESTMODE); + m66592_bclr(m66592, 0x8000, M66592_DVSTCTR); - m66592_bclr(m66592, M66592_USBE, M66592_SYSCFG); - m66592_bclr(m66592, M66592_DPRPU, M66592_SYSCFG); - m66592_bset(m66592, M66592_USBE, M66592_SYSCFG); + m66592_bset(m66592, M66592_INTL, M66592_INTENB1); + + m66592_write(m66592, 0, M66592_CFBCFG); + m66592_write(m66592, 0, M66592_D0FBCFG); + m66592_bset(m66592, endian, M66592_CFBCFG); + m66592_bset(m66592, endian, M66592_D0FBCFG); + } else { + unsigned int clock, vif, irq_sense; + + if (m66592->pdata->endian) + endian = M66592_BIGEND; /* big endian */ + else + endian = 0; /* little endian */ + + if (m66592->pdata->vif) + vif = M66592_LDRV; /* 3.3v */ + else + vif = 0; /* 1.5v */ + + switch (m66592->pdata->xtal) { + case M66592_PLATDATA_XTAL_12MHZ: + clock = M66592_XTAL12; + break; + case M66592_PLATDATA_XTAL_24MHZ: + clock = M66592_XTAL24; + break; + case M66592_PLATDATA_XTAL_48MHZ: + clock = M66592_XTAL48; + break; + default: + pr_warning("m66592-udc: xtal configuration error\n"); + clock = 0; + } - m66592_bset(m66592, M66592_XCKE, M66592_SYSCFG); + switch (m66592->irq_trigger) { + case IRQF_TRIGGER_LOW: + irq_sense = M66592_INTL; + break; + case IRQF_TRIGGER_FALLING: + irq_sense = 0; + break; + default: + pr_warning("m66592-udc: irq trigger config error\n"); + irq_sense = 0; + } - msleep(3); + m66592_bset(m66592, + (vif & M66592_LDRV) | (endian & M66592_BIGEND), + M66592_PINCFG); + m66592_bset(m66592, M66592_HSE, M66592_SYSCFG); /* High spd */ + m66592_mdfy(m66592, clock & M66592_XTAL, M66592_XTAL, + M66592_SYSCFG); + m66592_bclr(m66592, M66592_USBE, M66592_SYSCFG); + m66592_bclr(m66592, M66592_DPRPU, M66592_SYSCFG); + m66592_bset(m66592, M66592_USBE, M66592_SYSCFG); - m66592_bset(m66592, M66592_RCKE | M66592_PLLC, M66592_SYSCFG); + m66592_bset(m66592, M66592_XCKE, M66592_SYSCFG); + + msleep(3); - msleep(1); + m66592_bset(m66592, M66592_RCKE | M66592_PLLC, M66592_SYSCFG); - m66592_bset(m66592, M66592_SCKE, M66592_SYSCFG); + msleep(1); - m66592_bset(m66592, irq_sense & M66592_INTL, M66592_INTENB1); - m66592_write(m66592, M66592_BURST | M66592_CPU_ADR_RD_WR, - M66592_DMA0CFG); + m66592_bset(m66592, M66592_SCKE, M66592_SYSCFG); + + m66592_bset(m66592, irq_sense & M66592_INTL, M66592_INTENB1); + m66592_write(m66592, M66592_BURST | M66592_CPU_ADR_RD_WR, + M66592_DMA0CFG); + } } -#endif /* #if defined(CONFIG_SUPERH_BUILT_IN_M66592) */ static void disable_controller(struct m66592 *m66592) { -#if !defined(CONFIG_SUPERH_BUILT_IN_M66592) - m66592_bclr(m66592, M66592_SCKE, M66592_SYSCFG); - udelay(1); - m66592_bclr(m66592, M66592_PLLC, M66592_SYSCFG); - udelay(1); - m66592_bclr(m66592, M66592_RCKE, M66592_SYSCFG); - udelay(1); - m66592_bclr(m66592, M66592_XCKE, M66592_SYSCFG); -#endif + if (!m66592->pdata->on_chip) { + m66592_bclr(m66592, M66592_SCKE, M66592_SYSCFG); + udelay(1); + m66592_bclr(m66592, M66592_PLLC, M66592_SYSCFG); + udelay(1); + m66592_bclr(m66592, M66592_RCKE, M66592_SYSCFG); + udelay(1); + m66592_bclr(m66592, M66592_XCKE, M66592_SYSCFG); + } } static void m66592_start_xclock(struct m66592 *m66592) { -#if !defined(CONFIG_SUPERH_BUILT_IN_M66592) u16 tmp; - tmp = m66592_read(m66592, M66592_SYSCFG); - if (!(tmp & M66592_XCKE)) - m66592_bset(m66592, M66592_XCKE, M66592_SYSCFG); -#endif + if (!m66592->pdata->on_chip) { + tmp = m66592_read(m66592, M66592_SYSCFG); + if (!(tmp & M66592_XCKE)) + m66592_bset(m66592, M66592_XCKE, M66592_SYSCFG); + } } /*-------------------------------------------------------------------------*/ @@ -1169,8 +1197,7 @@ static irqreturn_t m66592_irq(int irq, void *_m66592) intsts0 = m66592_read(m66592, M66592_INTSTS0); intenb0 = m66592_read(m66592, M66592_INTENB0); -#if defined(CONFIG_SUPERH_BUILT_IN_M66592) - if (!intsts0 && !intenb0) { + if (m66592->pdata->on_chip && !intsts0 && !intenb0) { /* * When USB clock stops, it cannot read register. Even if a * clock stops, the interrupt occurs. So this driver turn on @@ -1180,7 +1207,6 @@ static irqreturn_t m66592_irq(int irq, void *_m66592) intsts0 = m66592_read(m66592, M66592_INTSTS0); intenb0 = m66592_read(m66592, M66592_INTENB0); } -#endif savepipe = m66592_read(m66592, M66592_CFIFOSEL); @@ -1526,9 +1552,11 @@ static int __exit m66592_remove(struct platform_device *pdev) iounmap(m66592->reg); free_irq(platform_get_irq(pdev, 0), m66592); m66592_free_request(&m66592->ep[0].ep, m66592->ep0_req); -#if defined(CONFIG_SUPERH_BUILT_IN_M66592) && defined(CONFIG_HAVE_CLK) - clk_disable(m66592->clk); - clk_put(m66592->clk); +#ifdef CONFIG_HAVE_CLK + if (m66592->pdata->on_chip) { + clk_disable(m66592->clk); + clk_put(m66592->clk); + } #endif kfree(m66592); return 0; @@ -1540,11 +1568,10 @@ static void nop_completion(struct usb_ep *ep, struct usb_request *r) static int __init m66592_probe(struct platform_device *pdev) { - struct resource *res; - int irq; + struct resource *res, *ires; void __iomem *reg = NULL; struct m66592 *m66592 = NULL; -#if defined(CONFIG_SUPERH_BUILT_IN_M66592) && defined(CONFIG_HAVE_CLK) +#ifdef CONFIG_HAVE_CLK char clk_name[8]; #endif int ret = 0; @@ -1557,10 +1584,11 @@ static int __init m66592_probe(struct platform_device *pdev) goto clean_up; } - irq = platform_get_irq(pdev, 0); - if (irq < 0) { + ires = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + if (!ires) { ret = -ENODEV; - pr_err("platform_get_irq error.\n"); + dev_err(&pdev->dev, + "platform_get_resource IORESOURCE_IRQ error.\n"); goto clean_up; } @@ -1571,6 +1599,12 @@ static int __init m66592_probe(struct platform_device *pdev) goto clean_up; } + if (pdev->dev.platform_data == NULL) { + dev_err(&pdev->dev, "no platform data\n"); + ret = -ENODEV; + goto clean_up; + } + /* initialize ucd */ m66592 = kzalloc(sizeof(struct m66592), GFP_KERNEL); if (m66592 == NULL) { @@ -1578,6 +1612,9 @@ static int __init m66592_probe(struct platform_device *pdev) goto clean_up; } + m66592->pdata = pdev->dev.platform_data; + m66592->irq_trigger = ires->flags & IRQF_TRIGGER_MASK; + spin_lock_init(&m66592->lock); dev_set_drvdata(&pdev->dev, m66592); @@ -1595,22 +1632,25 @@ static int __init m66592_probe(struct platform_device *pdev) m66592->timer.data = (unsigned long)m66592; m66592->reg = reg; - ret = request_irq(irq, m66592_irq, IRQF_DISABLED | IRQF_SHARED, + ret = request_irq(ires->start, m66592_irq, IRQF_DISABLED | IRQF_SHARED, udc_name, m66592); if (ret < 0) { pr_err("request_irq error (%d)\n", ret); goto clean_up; } -#if defined(CONFIG_SUPERH_BUILT_IN_M66592) && defined(CONFIG_HAVE_CLK) - snprintf(clk_name, sizeof(clk_name), "usbf%d", pdev->id); - m66592->clk = clk_get(&pdev->dev, clk_name); - if (IS_ERR(m66592->clk)) { - dev_err(&pdev->dev, "cannot get clock \"%s\"\n", clk_name); - ret = PTR_ERR(m66592->clk); - goto clean_up2; +#ifdef CONFIG_HAVE_CLK + if (m66592->pdata->on_chip) { + snprintf(clk_name, sizeof(clk_name), "usbf%d", pdev->id); + m66592->clk = clk_get(&pdev->dev, clk_name); + if (IS_ERR(m66592->clk)) { + dev_err(&pdev->dev, "cannot get clock \"%s\"\n", + clk_name); + ret = PTR_ERR(m66592->clk); + goto clean_up2; + } + clk_enable(m66592->clk); } - clk_enable(m66592->clk); #endif INIT_LIST_HEAD(&m66592->gadget.ep_list); m66592->gadget.ep0 = &m66592->ep[0].ep; @@ -1652,12 +1692,14 @@ static int __init m66592_probe(struct platform_device *pdev) return 0; clean_up3: -#if defined(CONFIG_SUPERH_BUILT_IN_M66592) && defined(CONFIG_HAVE_CLK) - clk_disable(m66592->clk); - clk_put(m66592->clk); +#ifdef CONFIG_HAVE_CLK + if (m66592->pdata->on_chip) { + clk_disable(m66592->clk); + clk_put(m66592->clk); + } clean_up2: #endif - free_irq(irq, m66592); + free_irq(ires->start, m66592); clean_up: if (m66592) { if (m66592->ep0_req) diff --git a/drivers/usb/gadget/m66592-udc.h b/drivers/usb/gadget/m66592-udc.h index 9a9c2bf9fbd5..8b960deed680 100644 --- a/drivers/usb/gadget/m66592-udc.h +++ b/drivers/usb/gadget/m66592-udc.h @@ -23,10 +23,12 @@ #ifndef __M66592_UDC_H__ #define __M66592_UDC_H__ -#if defined(CONFIG_SUPERH_BUILT_IN_M66592) && defined(CONFIG_HAVE_CLK) +#ifdef CONFIG_HAVE_CLK #include #endif +#include + #define M66592_SYSCFG 0x00 #define M66592_XTAL 0xC000 /* b15-14: Crystal selection */ #define M66592_XTAL48 0x8000 /* 48MHz */ @@ -76,11 +78,11 @@ #define M66592_P_TST_J 0x0001 /* PERI TEST J */ #define M66592_P_TST_NORMAL 0x0000 /* PERI Normal Mode */ -#if defined(CONFIG_SUPERH_BUILT_IN_M66592) +/* built-in registers */ #define M66592_CFBCFG 0x0A #define M66592_D0FBCFG 0x0C #define M66592_LITTLE 0x0100 /* b8: Little endian mode */ -#else +/* external chip case */ #define M66592_PINCFG 0x0A #define M66592_LDRV 0x8000 /* b15: Drive Current Adjust */ #define M66592_BIGEND 0x0100 /* b8: Big endian mode */ @@ -100,8 +102,8 @@ #define M66592_PKTM 0x0020 /* b5: Packet mode */ #define M66592_DENDE 0x0010 /* b4: Dend enable */ #define M66592_OBUS 0x0004 /* b2: OUTbus mode */ -#endif /* #if defined(CONFIG_SUPERH_BUILT_IN_M66592) */ +/* common case */ #define M66592_CFIFO 0x10 #define M66592_D0FIFO 0x14 #define M66592_D1FIFO 0x18 @@ -113,13 +115,9 @@ #define M66592_REW 0x4000 /* b14: Buffer rewind */ #define M66592_DCLRM 0x2000 /* b13: DMA buffer clear mode */ #define M66592_DREQE 0x1000 /* b12: DREQ output enable */ -#if defined(CONFIG_SUPERH_BUILT_IN_M66592) -#define M66592_MBW 0x0800 /* b11: Maximum bit width for FIFO */ -#else -#define M66592_MBW 0x0400 /* b10: Maximum bit width for FIFO */ -#define M66592_MBW_8 0x0000 /* 8bit */ -#define M66592_MBW_16 0x0400 /* 16bit */ -#endif /* #if defined(CONFIG_SUPERH_BUILT_IN_M66592) */ +#define M66592_MBW_8 0x0000 /* 8bit */ +#define M66592_MBW_16 0x0400 /* 16bit */ +#define M66592_MBW_32 0x0800 /* 32bit */ #define M66592_TRENB 0x0200 /* b9: Transaction counter enable */ #define M66592_TRCLR 0x0100 /* b8: Transaction counter clear */ #define M66592_DEZPM 0x0080 /* b7: Zero-length packet mode */ @@ -480,9 +478,11 @@ struct m66592_ep { struct m66592 { spinlock_t lock; void __iomem *reg; -#if defined(CONFIG_SUPERH_BUILT_IN_M66592) && defined(CONFIG_HAVE_CLK) +#ifdef CONFIG_HAVE_CLK struct clk *clk; #endif + struct m66592_platdata *pdata; + unsigned long irq_trigger; struct usb_gadget gadget; struct usb_gadget_driver *driver; @@ -546,13 +546,13 @@ static inline void m66592_read_fifo(struct m66592 *m66592, { unsigned long fifoaddr = (unsigned long)m66592->reg + offset; -#if defined(CONFIG_SUPERH_BUILT_IN_M66592) - len = (len + 3) / 4; - insl(fifoaddr, buf, len); -#else - len = (len + 1) / 2; - insw(fifoaddr, buf, len); -#endif + if (m66592->pdata->on_chip) { + len = (len + 3) / 4; + insl(fifoaddr, buf, len); + } else { + len = (len + 1) / 2; + insw(fifoaddr, buf, len); + } } static inline void m66592_write(struct m66592 *m66592, u16 val, @@ -566,33 +566,34 @@ static inline void m66592_write_fifo(struct m66592 *m66592, void *buf, unsigned long len) { unsigned long fifoaddr = (unsigned long)m66592->reg + offset; -#if defined(CONFIG_SUPERH_BUILT_IN_M66592) - unsigned long count; - unsigned char *pb; - int i; - - count = len / 4; - outsl(fifoaddr, buf, count); - - if (len & 0x00000003) { - pb = buf + count * 4; - for (i = 0; i < (len & 0x00000003); i++) { - if (m66592_read(m66592, M66592_CFBCFG)) /* little */ - outb(pb[i], fifoaddr + (3 - i)); - else - outb(pb[i], fifoaddr + i); + + if (m66592->pdata->on_chip) { + unsigned long count; + unsigned char *pb; + int i; + + count = len / 4; + outsl(fifoaddr, buf, count); + + if (len & 0x00000003) { + pb = buf + count * 4; + for (i = 0; i < (len & 0x00000003); i++) { + if (m66592_read(m66592, M66592_CFBCFG)) /* le */ + outb(pb[i], fifoaddr + (3 - i)); + else + outb(pb[i], fifoaddr + i); + } + } + } else { + unsigned long odd = len & 0x0001; + + len = len / 2; + outsw(fifoaddr, buf, len); + if (odd) { + unsigned char *p = buf + len*2; + outb(*p, fifoaddr); } } -#else - unsigned long odd = len & 0x0001; - - len = len / 2; - outsw(fifoaddr, buf, len); - if (odd) { - unsigned char *p = buf + len*2; - outb(*p, fifoaddr); - } -#endif /* #if defined(CONFIG_SUPERH_BUILT_IN_M66592) */ } static inline void m66592_mdfy(struct m66592 *m66592, u16 val, u16 pat, diff --git a/include/linux/usb/m66592.h b/include/linux/usb/m66592.h new file mode 100644 index 000000000000..cda9625e7df0 --- /dev/null +++ b/include/linux/usb/m66592.h @@ -0,0 +1,44 @@ +/* + * M66592 driver platform data + * + * Copyright (C) 2009 Renesas Solutions Corp. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef __LINUX_USB_M66592_H +#define __LINUX_USB_M66592_H + +#define M66592_PLATDATA_XTAL_12MHZ 0x01 +#define M66592_PLATDATA_XTAL_24MHZ 0x02 +#define M66592_PLATDATA_XTAL_48MHZ 0x03 + +struct m66592_platdata { + /* one = on chip controller, zero = external controller */ + unsigned on_chip:1; + + /* one = big endian, zero = little endian */ + unsigned endian:1; + + /* (external controller only) M66592_PLATDATA_XTAL_nnMHZ */ + unsigned xtal:2; + + /* (external controller only) one = 3.3V, zero = 1.5V */ + unsigned vif:1; + +}; + +#endif /* __LINUX_USB_M66592_H */ + -- cgit v1.2.3-71-gd317 From d9ab77161d811ffb0bccf396f7155cc905c1b9e1 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 22 Jul 2009 00:37:25 +0200 Subject: Driver Core: Make PM operations a const pointer They are not supposed to be modified by drivers, so make them const. Signed-off-by: Dmitry Torokhov Acked-by: Greg Kroah-Hartman Signed-off-by: Rafael J. Wysocki --- drivers/base/platform.c | 2 +- drivers/base/power/main.c | 8 +++++--- drivers/pci/pci-driver.c | 16 ++++++++-------- include/linux/device.h | 9 +++++---- 4 files changed, 19 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 455e55971d0e..ae5c4aa6d269 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -889,7 +889,7 @@ static int platform_pm_restore_noirq(struct device *dev) #endif /* !CONFIG_HIBERNATION */ -static struct dev_pm_ops platform_dev_pm_ops = { +static const struct dev_pm_ops platform_dev_pm_ops = { .prepare = platform_pm_prepare, .complete = platform_pm_complete, .suspend = platform_pm_suspend, diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 58a3e572f2c9..1b1a786b7dec 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -157,8 +157,9 @@ void device_pm_move_last(struct device *dev) * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. */ -static int pm_op(struct device *dev, struct dev_pm_ops *ops, - pm_message_t state) +static int pm_op(struct device *dev, + const struct dev_pm_ops *ops, + pm_message_t state) { int error = 0; @@ -220,7 +221,8 @@ static int pm_op(struct device *dev, struct dev_pm_ops *ops, * The operation is executed with interrupts disabled by the only remaining * functional CPU in the system. */ -static int pm_noirq_op(struct device *dev, struct dev_pm_ops *ops, +static int pm_noirq_op(struct device *dev, + const struct dev_pm_ops *ops, pm_message_t state) { int error = 0; diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index d76c4c85367e..0c2ea44ae5e1 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -575,7 +575,7 @@ static void pci_pm_complete(struct device *dev) static int pci_pm_suspend(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend(dev, PMSG_SUSPEND); @@ -613,7 +613,7 @@ static int pci_pm_suspend(struct device *dev) static int pci_pm_suspend_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend_late(dev, PMSG_SUSPEND); @@ -672,7 +672,7 @@ static int pci_pm_resume_noirq(struct device *dev) static int pci_pm_resume(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; int error = 0; /* @@ -711,7 +711,7 @@ static int pci_pm_resume(struct device *dev) static int pci_pm_freeze(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend(dev, PMSG_FREEZE); @@ -780,7 +780,7 @@ static int pci_pm_thaw_noirq(struct device *dev) static int pci_pm_thaw(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; int error = 0; if (pci_has_legacy_pm_support(pci_dev)) @@ -799,7 +799,7 @@ static int pci_pm_thaw(struct device *dev) static int pci_pm_poweroff(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend(dev, PMSG_HIBERNATE); @@ -872,7 +872,7 @@ static int pci_pm_restore_noirq(struct device *dev) static int pci_pm_restore(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; int error = 0; /* @@ -910,7 +910,7 @@ static int pci_pm_restore(struct device *dev) #endif /* !CONFIG_HIBERNATION */ -struct dev_pm_ops pci_dev_pm_ops = { +const struct dev_pm_ops pci_dev_pm_ops = { .prepare = pci_pm_prepare, .complete = pci_pm_complete, .suspend = pci_pm_suspend, diff --git a/include/linux/device.h b/include/linux/device.h index aebb81036db2..a28642975053 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -62,7 +62,7 @@ struct bus_type { int (*suspend)(struct device *dev, pm_message_t state); int (*resume)(struct device *dev); - struct dev_pm_ops *pm; + const struct dev_pm_ops *pm; struct bus_type_private *p; }; @@ -132,7 +132,7 @@ struct device_driver { int (*resume) (struct device *dev); struct attribute_group **groups; - struct dev_pm_ops *pm; + const struct dev_pm_ops *pm; struct driver_private *p; }; @@ -200,7 +200,8 @@ struct class { int (*suspend)(struct device *dev, pm_message_t state); int (*resume)(struct device *dev); - struct dev_pm_ops *pm; + const struct dev_pm_ops *pm; + struct class_private *p; }; @@ -291,7 +292,7 @@ struct device_type { char *(*nodename)(struct device *dev); void (*release)(struct device *dev); - struct dev_pm_ops *pm; + const struct dev_pm_ops *pm; }; /* interface for exporting device attributes */ -- cgit v1.2.3-71-gd317 From 8150f32b90f630ad3e460f026ce338cb81685bc9 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 24 Jul 2009 22:11:32 -0700 Subject: Driver Core: Make PM operations a const pointer They are not supposed to be modified by drivers, so make them const. Signed-off-by: Dmitry Torokhov Acked-by: Greg Kroah-Hartman Signed-off-by: Rafael J. Wysocki --- drivers/base/platform.c | 2 +- drivers/base/power/main.c | 8 +++++--- drivers/pci/pci-driver.c | 16 ++++++++-------- include/linux/device.h | 9 +++++---- 4 files changed, 19 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 81cb01bfc356..c4b3fcee17b2 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -925,7 +925,7 @@ static int platform_pm_restore_noirq(struct device *dev) #endif /* !CONFIG_HIBERNATION */ -static struct dev_pm_ops platform_dev_pm_ops = { +static const struct dev_pm_ops platform_dev_pm_ops = { .prepare = platform_pm_prepare, .complete = platform_pm_complete, .suspend = platform_pm_suspend, diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 58a3e572f2c9..1b1a786b7dec 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -157,8 +157,9 @@ void device_pm_move_last(struct device *dev) * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. */ -static int pm_op(struct device *dev, struct dev_pm_ops *ops, - pm_message_t state) +static int pm_op(struct device *dev, + const struct dev_pm_ops *ops, + pm_message_t state) { int error = 0; @@ -220,7 +221,8 @@ static int pm_op(struct device *dev, struct dev_pm_ops *ops, * The operation is executed with interrupts disabled by the only remaining * functional CPU in the system. */ -static int pm_noirq_op(struct device *dev, struct dev_pm_ops *ops, +static int pm_noirq_op(struct device *dev, + const struct dev_pm_ops *ops, pm_message_t state) { int error = 0; diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index d76c4c85367e..0c2ea44ae5e1 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -575,7 +575,7 @@ static void pci_pm_complete(struct device *dev) static int pci_pm_suspend(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend(dev, PMSG_SUSPEND); @@ -613,7 +613,7 @@ static int pci_pm_suspend(struct device *dev) static int pci_pm_suspend_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend_late(dev, PMSG_SUSPEND); @@ -672,7 +672,7 @@ static int pci_pm_resume_noirq(struct device *dev) static int pci_pm_resume(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; int error = 0; /* @@ -711,7 +711,7 @@ static int pci_pm_resume(struct device *dev) static int pci_pm_freeze(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend(dev, PMSG_FREEZE); @@ -780,7 +780,7 @@ static int pci_pm_thaw_noirq(struct device *dev) static int pci_pm_thaw(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; int error = 0; if (pci_has_legacy_pm_support(pci_dev)) @@ -799,7 +799,7 @@ static int pci_pm_thaw(struct device *dev) static int pci_pm_poweroff(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend(dev, PMSG_HIBERNATE); @@ -872,7 +872,7 @@ static int pci_pm_restore_noirq(struct device *dev) static int pci_pm_restore(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); - struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; int error = 0; /* @@ -910,7 +910,7 @@ static int pci_pm_restore(struct device *dev) #endif /* !CONFIG_HIBERNATION */ -struct dev_pm_ops pci_dev_pm_ops = { +const struct dev_pm_ops pci_dev_pm_ops = { .prepare = pci_pm_prepare, .complete = pci_pm_complete, .suspend = pci_pm_suspend, diff --git a/include/linux/device.h b/include/linux/device.h index aebb81036db2..a28642975053 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -62,7 +62,7 @@ struct bus_type { int (*suspend)(struct device *dev, pm_message_t state); int (*resume)(struct device *dev); - struct dev_pm_ops *pm; + const struct dev_pm_ops *pm; struct bus_type_private *p; }; @@ -132,7 +132,7 @@ struct device_driver { int (*resume) (struct device *dev); struct attribute_group **groups; - struct dev_pm_ops *pm; + const struct dev_pm_ops *pm; struct driver_private *p; }; @@ -200,7 +200,8 @@ struct class { int (*suspend)(struct device *dev, pm_message_t state); int (*resume)(struct device *dev); - struct dev_pm_ops *pm; + const struct dev_pm_ops *pm; + struct class_private *p; }; @@ -291,7 +292,7 @@ struct device_type { char *(*nodename)(struct device *dev); void (*release)(struct device *dev); - struct dev_pm_ops *pm; + const struct dev_pm_ops *pm; }; /* interface for exporting device attributes */ -- cgit v1.2.3-71-gd317 From 633aae23ff31bef692a70772652e753a0ae59b81 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 22 Jul 2009 21:51:36 -0700 Subject: Input: serio - switch to using dev_pm_ops Signed-off-by: Dmitry Torokhov --- drivers/input/serio/serio.c | 23 +++++++++++------------ include/linux/serio.h | 2 -- 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c index d66f4944f2a0..0236f0d5fd91 100644 --- a/drivers/input/serio/serio.c +++ b/drivers/input/serio/serio.c @@ -931,15 +931,11 @@ static int serio_uevent(struct device *dev, struct kobj_uevent_env *env) #endif /* CONFIG_HOTPLUG */ #ifdef CONFIG_PM -static int serio_suspend(struct device *dev, pm_message_t state) +static int serio_suspend(struct device *dev) { struct serio *serio = to_serio_port(dev); - if (!serio->suspended && state.event == PM_EVENT_SUSPEND) - serio_cleanup(serio); - - serio->suspended = state.event == PM_EVENT_SUSPEND || - state.event == PM_EVENT_FREEZE; + serio_cleanup(serio); return 0; } @@ -952,13 +948,17 @@ static int serio_resume(struct device *dev) * Driver reconnect can take a while, so better let kseriod * deal with it. */ - if (serio->suspended) { - serio->suspended = false; - serio_queue_event(serio, NULL, SERIO_RECONNECT_PORT); - } + serio_queue_event(serio, NULL, SERIO_RECONNECT_PORT); return 0; } + +static const struct dev_pm_ops serio_pm_ops = { + .suspend = serio_suspend, + .resume = serio_resume, + .poweroff = serio_suspend, + .restore = serio_resume, +}; #endif /* CONFIG_PM */ /* called from serio_driver->connect/disconnect methods under serio_mutex */ @@ -1015,8 +1015,7 @@ static struct bus_type serio_bus = { .remove = serio_driver_remove, .shutdown = serio_shutdown, #ifdef CONFIG_PM - .suspend = serio_suspend, - .resume = serio_resume, + .pm = &serio_pm_ops, #endif }; diff --git a/include/linux/serio.h b/include/linux/serio.h index 126d24c9eaa8..a640bc2afe76 100644 --- a/include/linux/serio.h +++ b/include/linux/serio.h @@ -31,8 +31,6 @@ struct serio { bool manual_bind; bool registered; /* port has been fully registered with driver core */ - bool suspended; /* port is suspended */ - struct serio_device_id id; -- cgit v1.2.3-71-gd317 From 0c193054a4c1cf190d2f23e5e91bd14402e43912 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Mon, 27 Jul 2009 19:09:19 -0400 Subject: nfsd41: hange from page to memory based drc limits NFSD_SLOT_CACHE_SIZE is the size of all encoded operation responses (excluding the sequence operation) that we want to cache. For now, keep NFSD_SLOT_CACHE_SIZE at PAGE_SIZE. It will be reduced when the DRC is changed from page based to memory based. Signed-off-by: Andy Adamson Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 28 +++++++++++++--------------- fs/nfsd/nfssvc.c | 13 ++++++------- include/linux/nfsd/nfsd.h | 4 ++-- include/linux/nfsd/state.h | 1 + 4 files changed, 22 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 70cba3fbfa6d..e2b11b1b515c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -414,31 +414,31 @@ gen_sessionid(struct nfsd4_session *ses) /* * Give the client the number of slots it requests bound by - * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages. + * NFSD_MAX_SLOTS_PER_SESSION and by nfsd_drc_max_mem. * - * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we - * should (up to a point) re-negotiate active sessions and reduce their - * slot usage to make rooom for new connections. For now we just fail the - * create session. + * If we run out of reserved DRC memory we should (up to a point) re-negotiate + * active sessions and reduce their slot usage to make rooom for new + * connections. For now we just fail the create session. */ static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan) { - int np; + int mem; if (fchan->maxreqs < 1) return nfserr_inval; else if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION) fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION; - np = fchan->maxreqs * NFSD_PAGES_PER_SLOT; + mem = fchan->maxreqs * NFSD_SLOT_CACHE_SIZE; spin_lock(&nfsd_drc_lock); - if (np + nfsd_drc_pages_used > nfsd_drc_max_pages) - np = nfsd_drc_max_pages - nfsd_drc_pages_used; - nfsd_drc_pages_used += np; + if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) + mem = ((nfsd_drc_max_mem - nfsd_drc_mem_used) / + NFSD_SLOT_CACHE_SIZE) * NFSD_SLOT_CACHE_SIZE; + nfsd_drc_mem_used += mem; spin_unlock(&nfsd_drc_lock); - fchan->maxreqs = np / NFSD_PAGES_PER_SLOT; + fchan->maxreqs = mem / NFSD_SLOT_CACHE_SIZE; if (fchan->maxreqs == 0) return nfserr_resource; return 0; @@ -465,9 +465,7 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp, fchan->maxresp_sz = maxcount; session_fchan->maxresp_sz = fchan->maxresp_sz; - /* Set the max response cached size our default which is - * a multiple of PAGE_SIZE and small */ - session_fchan->maxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE; + session_fchan->maxresp_cached = NFSD_SLOT_CACHE_SIZE; fchan->maxresp_cached = session_fchan->maxresp_cached; /* Use the client's maxops if possible */ @@ -585,7 +583,7 @@ free_session(struct kref *kref) nfsd4_release_respages(e->ce_respages, e->ce_resused); } spin_lock(&nfsd_drc_lock); - nfsd_drc_pages_used -= ses->se_fchannel.maxreqs * NFSD_PAGES_PER_SLOT; + nfsd_drc_mem_used -= ses->se_fchannel.maxreqs * NFSD_SLOT_CACHE_SIZE; spin_unlock(&nfsd_drc_lock); kfree(ses); } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9be2a1932f8a..5a280a9cb540 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -74,8 +74,8 @@ struct svc_serv *nfsd_serv; * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. */ spinlock_t nfsd_drc_lock; -unsigned int nfsd_drc_max_pages; -unsigned int nfsd_drc_pages_used; +unsigned int nfsd_drc_max_mem; +unsigned int nfsd_drc_mem_used; #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static struct svc_stat nfsd_acl_svcstats; @@ -247,12 +247,11 @@ void nfsd_reset_versions(void) static void set_max_drc(void) { #define NFSD_DRC_SIZE_SHIFT 10 - nfsd_drc_max_pages = nr_free_buffer_pages() - >> NFSD_DRC_SIZE_SHIFT; - nfsd_drc_pages_used = 0; + nfsd_drc_max_mem = (nr_free_buffer_pages() + >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; + nfsd_drc_mem_used = 0; spin_lock_init(&nfsd_drc_lock); - dprintk("%s nfsd_drc_max_pages %u\n", __func__, - nfsd_drc_max_pages); + dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem); } int nfsd_create_serv(void) diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 2571f856908f..2812ed52669d 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -57,8 +57,8 @@ extern u32 nfsd_supported_minorversion; extern struct mutex nfsd_mutex; extern struct svc_serv *nfsd_serv; extern spinlock_t nfsd_drc_lock; -extern unsigned int nfsd_drc_max_pages; -extern unsigned int nfsd_drc_pages_used; +extern unsigned int nfsd_drc_max_mem; +extern unsigned int nfsd_drc_mem_used; extern struct seq_operations nfs_exports_op; diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 57ab2ed08459..a6c87d623891 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -96,6 +96,7 @@ struct nfs4_cb_conn { #define NFSD_MAX_SLOTS_PER_SESSION 128 /* Maximum number of pages per slot cache entry */ #define NFSD_PAGES_PER_SLOT 1 +#define NFSD_SLOT_CACHE_SIZE PAGE_SIZE /* Maximum number of operations per session compound */ #define NFSD_MAX_OPS_PER_COMPOUND 16 -- cgit v1.2.3-71-gd317 From 49557cc74c7bdf6a984be227ead9a84b3a26f053 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Thu, 23 Jul 2009 19:02:16 -0400 Subject: nfsd41: Use separate DRC for setclientid Instead of trying to share the generic 4.1 reply cache code for the CREATE_SESSION reply cache, it's simpler to handle CREATE_SESSION separately. The nfs41 single slot clientid DRC holds the results of create session processing. CREATE_SESSION can be preceeded by a SEQUENCE operation (an embedded CREATE_SESSION) and the create session single slot cache must be maintained. nfsd4_replay_cache_entry() and nfsd4_store_cache_entry() do not implement the replay of an embedded CREATE_SESSION. The clientid DRC slot does not need the inuse, cachethis or other fields that the multiple slot session cache uses. Replace the clientid DRC cache struct nfs4_slot cache with a new nfsd4_clid_slot cache. Save the xdr struct nfsd4_create_session into the cache at the end of processing, and on a replay, replace the struct for the replay request with the cached version all while under the state lock. nfsd4_proc_compound will handle both the solo and embedded CREATE_SESSION case via the normal use of encode_operation. Errors that do not change the create session cache: A create session NFS4ERR_STALE_CLIENTID error means that a client record (and associated create session slot) could not be found and therefore can't be changed. NFSERR_SEQ_MISORDERED errors do not change the slot cache. All other errors get cached. Remove the clientid DRC specific check in nfs4svc_encode_compoundres to put the session only if cstate.session is set which will now always be true. Signed-off-by: Andy Adamson Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfs4state.c | 64 +++++++++++++++++++++++++++------------------- fs/nfsd/nfs4xdr.c | 3 +-- include/linux/nfsd/state.h | 21 ++++++++++++++- include/linux/nfsd/xdr4.h | 12 --------- 5 files changed, 60 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index d781658e8084..d606c6a427de 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1120,7 +1120,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, BUG_ON(op->status == nfs_ok); encode_op: - /* Only from SEQUENCE or CREATE_SESSION */ + /* Only from SEQUENCE */ if (resp->cstate.status == nfserr_replay_cache) { dprintk("%s NFS4.1 replay from cache\n", __func__); if (nfsd4_not_cached(resp)) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 99df8e7a687b..7729d092c8a5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -653,8 +653,6 @@ static inline void free_client(struct nfs4_client *clp) { shutdown_callback_client(clp); - nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages, - clp->cl_slot.sl_cache_entry.ce_resused); if (clp->cl_cred.cr_group_info) put_group_info(clp->cl_cred.cr_group_info); kfree(clp->cl_principal); @@ -1293,12 +1291,11 @@ out_copy: exid->clientid.cl_boot = new->cl_clientid.cl_boot; exid->clientid.cl_id = new->cl_clientid.cl_id; - new->cl_slot.sl_seqid = 0; exid->seqid = 1; nfsd4_set_ex_flags(new, exid); dprintk("nfsd4_exchange_id seqid %d flags %x\n", - new->cl_slot.sl_seqid, new->cl_exchange_flags); + new->cl_cs_slot.sl_seqid, new->cl_exchange_flags); status = nfs_ok; out: @@ -1334,15 +1331,35 @@ check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse) return nfserr_seq_misordered; } +/* + * Cache the create session result into the create session single DRC + * slot cache by saving the xdr structure. sl_seqid has been set. + * Do this for solo or embedded create session operations. + */ +static void +nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses, + struct nfsd4_clid_slot *slot, int nfserr) +{ + slot->sl_status = nfserr; + memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses)); +} + +static __be32 +nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses, + struct nfsd4_clid_slot *slot) +{ + memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses)); + return slot->sl_status; +} + __be32 nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_create_session *cr_ses) { u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; - struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfs4_client *conf, *unconf; - struct nfsd4_slot *slot = NULL; + struct nfsd4_clid_slot *cs_slot = NULL; int status = 0; nfs4_lock_state(); @@ -1350,25 +1367,22 @@ nfsd4_create_session(struct svc_rqst *rqstp, conf = find_confirmed_client(&cr_ses->clientid); if (conf) { - slot = &conf->cl_slot; - status = check_slot_seqid(cr_ses->seqid, slot->sl_seqid, - slot->sl_inuse); + cs_slot = &conf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); if (status == nfserr_replay_cache) { dprintk("Got a create_session replay! seqid= %d\n", - slot->sl_seqid); - cstate->slot = slot; - cstate->status = status; + cs_slot->sl_seqid); /* Return the cached reply status */ - status = nfsd4_replay_cache_entry(resp, NULL); + status = nfsd4_replay_create_session(cr_ses, cs_slot); goto out; - } else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) { + } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) { status = nfserr_seq_misordered; dprintk("Sequence misordered!\n"); dprintk("Expected seqid= %d but got seqid= %d\n", - slot->sl_seqid, cr_ses->seqid); + cs_slot->sl_seqid, cr_ses->seqid); goto out; } - conf->cl_slot.sl_seqid++; + cs_slot->sl_seqid++; } else if (unconf) { if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || (ip_addr != unconf->cl_addr)) { @@ -1376,16 +1390,15 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out; } - slot = &unconf->cl_slot; - status = check_slot_seqid(cr_ses->seqid, slot->sl_seqid, - slot->sl_inuse); + cs_slot = &unconf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); if (status) { /* an unconfirmed replay returns misordered */ status = nfserr_seq_misordered; - goto out; + goto out_cache; } - slot->sl_seqid++; /* from 0 to 1 */ + cs_slot->sl_seqid++; /* from 0 to 1 */ move_to_confirmed(unconf); /* @@ -1406,12 +1419,11 @@ nfsd4_create_session(struct svc_rqst *rqstp, memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN); - cr_ses->seqid = slot->sl_seqid; + cr_ses->seqid = cs_slot->sl_seqid; - slot->sl_inuse = true; - cstate->slot = slot; - /* Ensure a page is used for the cache */ - slot->sl_cache_entry.ce_cachethis = 1; +out_cache: + /* cache solo and embedded create sessions under the state lock */ + nfsd4_cache_create_session(cr_ses, cs_slot, status); out: nfs4_unlock_state(); dprintk("%s returns %d\n", __func__, ntohl(status)); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2dcc7feaa6ff..fdf632bf1cfe 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3313,8 +3313,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__); resp->cstate.slot->sl_inuse = 0; } - if (resp->cstate.session) - nfsd4_put_session(resp->cstate.session); + nfsd4_put_session(resp->cstate.session); } return 1; } diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index a6c87d623891..58bb19784e12 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -127,6 +127,25 @@ struct nfsd4_channel_attrs { u32 rdma_attrs; }; +struct nfsd4_create_session { + clientid_t clientid; + struct nfs4_sessionid sessionid; + u32 seqid; + u32 flags; + struct nfsd4_channel_attrs fore_channel; + struct nfsd4_channel_attrs back_channel; + u32 callback_prog; + u32 uid; + u32 gid; +}; + +/* The single slot clientid cache structure */ +struct nfsd4_clid_slot { + u32 sl_seqid; + __be32 sl_status; + struct nfsd4_create_session sl_cr_ses; +}; + struct nfsd4_session { struct kref se_ref; struct list_head se_hash; /* hash by sessionid */ @@ -193,7 +212,7 @@ struct nfs4_client { /* for nfs41 */ struct list_head cl_sessions; - struct nfsd4_slot cl_slot; /* create_session slot */ + struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ u32 cl_exchange_flags; struct nfs4_sessionid cl_sessionid; }; diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index 2bacf7535069..5e4beb0deb80 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -366,18 +366,6 @@ struct nfsd4_exchange_id { int spa_how; }; -struct nfsd4_create_session { - clientid_t clientid; - struct nfs4_sessionid sessionid; - u32 seqid; - u32 flags; - struct nfsd4_channel_attrs fore_channel; - struct nfsd4_channel_attrs back_channel; - u32 callback_prog; - u32 uid; - u32 gid; -}; - struct nfsd4_sequence { struct nfs4_sessionid sessionid; /* request/response */ u32 seqid; /* request/response */ -- cgit v1.2.3-71-gd317 From e5f5ccb646bc6009572b5c23201b5e81638ff150 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Thu, 23 Jul 2009 20:35:53 +0200 Subject: power_supply: get_by_name and set_charged functionality This adds a function that indicates that a battery is fully charged. It also includes functions to get a power_supply device from the class of registered devices by name reference. These can be used to find a specific battery to call power_supply_set_battery_charged() on. Some battery drivers might need this information to calibrate themselves. Signed-off-by: Daniel Mack Cc: Ian Molton Cc: Anton Vorontsov Cc: Matt Reimer Signed-off-by: Anton Vorontsov --- drivers/power/power_supply_core.c | 28 ++++++++++++++++++++++++++++ include/linux/power_supply.h | 3 +++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/drivers/power/power_supply_core.c b/drivers/power/power_supply_core.c index 12cd6e36ff1d..cce75b40b435 100644 --- a/drivers/power/power_supply_core.c +++ b/drivers/power/power_supply_core.c @@ -116,6 +116,34 @@ int power_supply_is_system_supplied(void) } EXPORT_SYMBOL_GPL(power_supply_is_system_supplied); +int power_supply_set_battery_charged(struct power_supply *psy) +{ + if (psy->type == POWER_SUPPLY_TYPE_BATTERY && psy->set_charged) { + psy->set_charged(psy); + return 0; + } + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(power_supply_set_battery_charged); + +static int power_supply_match_device_by_name(struct device *dev, void *data) +{ + const char *name = data; + struct power_supply *psy = dev_get_drvdata(dev); + + return strcmp(psy->name, name) == 0; +} + +struct power_supply *power_supply_get_by_name(char *name) +{ + struct device *dev = class_find_device(power_supply_class, NULL, name, + power_supply_match_device_by_name); + + return dev ? dev_get_drvdata(dev) : NULL; +} +EXPORT_SYMBOL_GPL(power_supply_get_by_name); + int power_supply_register(struct device *parent, struct power_supply *psy) { int rc = 0; diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 4c7c6fc35487..b5d096d3a9be 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -144,6 +144,7 @@ struct power_supply { enum power_supply_property psp, union power_supply_propval *val); void (*external_power_changed)(struct power_supply *psy); + void (*set_charged)(struct power_supply *psy); /* For APM emulation, think legacy userspace. */ int use_for_apm; @@ -183,8 +184,10 @@ struct power_supply_info { int use_for_apm; }; +extern struct power_supply *power_supply_get_by_name(char *name); extern void power_supply_changed(struct power_supply *psy); extern int power_supply_am_i_supplied(struct power_supply *psy); +extern int power_supply_set_battery_charged(struct power_supply *psy); #if defined(CONFIG_POWER_SUPPLY) || defined(CONFIG_POWER_SUPPLY_MODULE) extern int power_supply_is_system_supplied(void); -- cgit v1.2.3-71-gd317 From 78ddb2785980fc01b129c3547463266cae9c6ca9 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 30 Jul 2009 23:23:34 +0100 Subject: ARM: PL093: Header file for PL093 SSMC PrimeCell Header to define the standard registers for an PL093 SSMC memory controller. Signed-off-by: Ben Dooks Signed-off-by: Ben Dooks --- include/linux/amba/pl093.h | 80 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 include/linux/amba/pl093.h (limited to 'include/linux') diff --git a/include/linux/amba/pl093.h b/include/linux/amba/pl093.h new file mode 100644 index 000000000000..2983e3671adb --- /dev/null +++ b/include/linux/amba/pl093.h @@ -0,0 +1,80 @@ +/* linux/amba/pl093.h + * + * Copyright (c) 2008 Simtec Electronics + * http://armlinux.simtec.co.uk/ + * Ben Dooks + * + * AMBA PL093 SSMC (synchronous static memory controller) + * See DDI0236.pdf (r0p4) for more details + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. +*/ + +#define SMB_BANK(x) ((x) * 0x20) /* each bank control set is 0x20 apart */ + +/* Offsets for SMBxxxxRy registers */ + +#define SMBIDCYR (0x00) +#define SMBWSTRDR (0x04) +#define SMBWSTWRR (0x08) +#define SMBWSTOENR (0x0C) +#define SMBWSTWENR (0x10) +#define SMBCR (0x14) +#define SMBSR (0x18) +#define SMBWSTBRDR (0x1C) + +/* Masks for SMB registers */ +#define IDCY_MASK (0xf) +#define WSTRD_MASK (0xf) +#define WSTWR_MASK (0xf) +#define WSTOEN_MASK (0xf) +#define WSTWEN_MASK (0xf) + +/* Notes from datasheet: + * WSTOEN <= WSTRD + * WSTWEN <= WSTWR + * + * WSTOEN is not used with nWAIT + */ + +/* SMBCR bit definitions */ +#define SMBCR_BIWRITEEN (1 << 21) +#define SMBCR_ADDRVALIDWRITEEN (1 << 20) +#define SMBCR_SYNCWRITE (1 << 17) +#define SMBCR_BMWRITE (1 << 16) +#define SMBCR_WRAPREAD (1 << 14) +#define SMBCR_BIREADEN (1 << 13) +#define SMBCR_ADDRVALIDREADEN (1 << 12) +#define SMBCR_SYNCREAD (1 << 9) +#define SMBCR_BMREAD (1 << 8) +#define SMBCR_SMBLSPOL (1 << 6) +#define SMBCR_WP (1 << 3) +#define SMBCR_WAITEN (1 << 2) +#define SMBCR_WAITPOL (1 << 1) +#define SMBCR_RBLE (1 << 0) + +#define SMBCR_BURSTLENWRITE_MASK (3 << 18) +#define SMBCR_BURSTLENWRITE_4 (0 << 18) +#define SMBCR_BURSTLENWRITE_8 (1 << 18) +#define SMBCR_BURSTLENWRITE_RESERVED (2 << 18) +#define SMBCR_BURSTLENWRITE_CONTINUOUS (3 << 18) + +#define SMBCR_BURSTLENREAD_MASK (3 << 10) +#define SMBCR_BURSTLENREAD_4 (0 << 10) +#define SMBCR_BURSTLENREAD_8 (1 << 10) +#define SMBCR_BURSTLENREAD_16 (2 << 10) +#define SMBCR_BURSTLENREAD_CONTINUOUS (3 << 10) + +#define SMBCR_MW_MASK (3 << 4) +#define SMBCR_MW_8BIT (0 << 4) +#define SMBCR_MW_16BIT (1 << 4) +#define SMBCR_MW_M32BIT (2 << 4) + +/* SSMC status registers */ +#define SSMCCSR (0x200) +#define SSMCCR (0x204) +#define SSMCITCR (0x208) +#define SSMCITIP (0x20C) +#define SSMCITIOP (0x210) -- cgit v1.2.3-71-gd317 From ff663cf8705bea101d5f73cf471855c85242575e Mon Sep 17 00:00:00 2001 From: Zhenyu Wang Date: Thu, 23 Jul 2009 17:25:49 +0100 Subject: agp: Add generic support for graphics dma remapping New driver hooks for support graphics memory dma remapping are introduced in this patch. It makes generic code can tell if current device needs dma remapping, then call driver provided interfaces for mapping and unmapping. Change has also been made to handle scratch_page in remapping case. Signed-off-by: Zhenyu Wang Signed-off-by: David Woodhouse --- drivers/char/agp/agp.h | 6 ++++++ drivers/char/agp/backend.c | 20 ++++++++++++++++++++ drivers/char/agp/generic.c | 9 +++++++++ include/linux/agp_backend.h | 6 +++++- 4 files changed, 40 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h index ce110a3bf298..17e6d0d3ba36 100644 --- a/drivers/char/agp/agp.h +++ b/drivers/char/agp/agp.h @@ -121,6 +121,11 @@ struct agp_bridge_driver { void (*agp_destroy_pages)(struct agp_memory *); int (*agp_type_to_mask_type) (struct agp_bridge_data *, int); void (*chipset_flush)(struct agp_bridge_data *); + + int (*agp_map_page)(void *addr, dma_addr_t *ret); + void (*agp_unmap_page)(void *addr, dma_addr_t dma); + int (*agp_map_memory)(struct agp_memory *mem); + void (*agp_unmap_memory)(struct agp_memory *mem); }; struct agp_bridge_data { @@ -135,6 +140,7 @@ struct agp_bridge_data { u32 *gatt_table_real; unsigned long scratch_page; unsigned long scratch_page_real; + dma_addr_t scratch_page_dma; unsigned long gart_bus_addr; unsigned long gatt_bus_addr; u32 mode; diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index 3bd7e503de41..19ac3663acdc 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -152,6 +152,15 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge) bridge->scratch_page_real = phys_to_gart(page_to_phys(page)); bridge->scratch_page = bridge->driver->mask_memory(bridge, phys_to_gart(page_to_phys(page)), 0); + + if (bridge->driver->agp_map_page && + bridge->driver->agp_map_page(phys_to_virt(page_to_phys(page)), + &bridge->scratch_page_dma)) { + dev_err(&bridge->dev->dev, + "unable to dma-map scratch page\n"); + rc = -ENOMEM; + goto err_out_nounmap; + } } size_value = bridge->driver->fetch_size(); @@ -191,6 +200,13 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge) return 0; err_out: + if (bridge->driver->needs_scratch_page && + bridge->driver->agp_unmap_page) { + void *va = gart_to_virt(bridge->scratch_page_real); + + bridge->driver->agp_unmap_page(va, bridge->scratch_page_dma); + } +err_out_nounmap: if (bridge->driver->needs_scratch_page) { void *va = gart_to_virt(bridge->scratch_page_real); @@ -221,6 +237,10 @@ static void agp_backend_cleanup(struct agp_bridge_data *bridge) bridge->driver->needs_scratch_page) { void *va = gart_to_virt(bridge->scratch_page_real); + if (bridge->driver->agp_unmap_page) + bridge->driver->agp_unmap_page(va, + bridge->scratch_page_dma); + bridge->driver->agp_destroy_page(va, AGP_PAGE_DESTROY_UNMAP); bridge->driver->agp_destroy_page(va, AGP_PAGE_DESTROY_FREE); } diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index a3bcc7ef42f9..28f0208c66a6 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -437,6 +437,12 @@ int agp_bind_memory(struct agp_memory *curr, off_t pg_start) curr->bridge->driver->cache_flush(); curr->is_flushed = true; } + + if (curr->bridge->driver->agp_map_memory) { + ret_val = curr->bridge->driver->agp_map_memory(curr); + if (ret_val) + return ret_val; + } ret_val = curr->bridge->driver->insert_memory(curr, pg_start, curr->type); if (ret_val != 0) @@ -478,6 +484,9 @@ int agp_unbind_memory(struct agp_memory *curr) if (ret_val != 0) return ret_val; + if (curr->bridge->driver->agp_unmap_memory) + curr->bridge->driver->agp_unmap_memory(curr); + curr->is_bound = false; curr->pg_start = 0; spin_lock(&curr->bridge->mapped_lock); diff --git a/include/linux/agp_backend.h b/include/linux/agp_backend.h index 76fa794fdac0..8a294d65b9b1 100644 --- a/include/linux/agp_backend.h +++ b/include/linux/agp_backend.h @@ -79,9 +79,13 @@ struct agp_memory { u32 physical; bool is_bound; bool is_flushed; - bool vmalloc_flag; + bool vmalloc_flag; + bool sg_vmalloc_flag; /* list of agp_memory mapped to the aperture */ struct list_head mapped_list; + /* DMA-mapped addresses */ + struct scatterlist *sg_list; + int num_sg; }; #define AGP_NORMAL_MEMORY 0 -- cgit v1.2.3-71-gd317 From f692775d7e0a22477143cd884e45c955448ac7d2 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 29 Jul 2009 09:28:45 +0100 Subject: intel-agp: fix sglist allocation to avoid vmalloc() Signed-off-by: David Woodhouse --- drivers/char/agp/intel-agp.c | 29 ++++++++++------------------- include/linux/agp_backend.h | 1 - 2 files changed, 10 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index b9d9886ff3c3..d8c80d8be5e2 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -198,39 +198,30 @@ static void intel_agp_unmap_page(struct page *page, dma_addr_t dma) static void intel_agp_free_sglist(struct agp_memory *mem) { + struct sg_table st; + + st.sgl = mem->sg_list; + st.orig_nents = st.nents = mem->page_count; + + sg_free_table(&st); - if (mem->sg_vmalloc_flag) - vfree(mem->sg_list); - else - kfree(mem->sg_list); - mem->sg_vmalloc_flag = 0; mem->sg_list = NULL; mem->num_sg = 0; } static int intel_agp_map_memory(struct agp_memory *mem) { + struct sg_table st; struct scatterlist *sg; int i; DBG("try mapping %lu pages\n", (unsigned long)mem->page_count); - if ((mem->page_count * sizeof(*mem->sg_list)) < 2*PAGE_SIZE) - mem->sg_list = kcalloc(mem->page_count, sizeof(*mem->sg_list), - GFP_KERNEL); - - if (mem->sg_list == NULL) { - mem->sg_list = vmalloc(mem->page_count * sizeof(*mem->sg_list)); - mem->sg_vmalloc_flag = 1; - } - - if (!mem->sg_list) { - mem->sg_vmalloc_flag = 0; + if (sg_alloc_table(&st, mem->page_count, GFP_KERNEL)) return -ENOMEM; - } - sg_init_table(mem->sg_list, mem->page_count); - sg = mem->sg_list; + mem->sg_list = sg = st.sgl; + for (i = 0 ; i < mem->page_count; i++, sg = sg_next(sg)) sg_set_page(sg, mem->pages[i], PAGE_SIZE, 0); diff --git a/include/linux/agp_backend.h b/include/linux/agp_backend.h index 8a294d65b9b1..880130f7311f 100644 --- a/include/linux/agp_backend.h +++ b/include/linux/agp_backend.h @@ -80,7 +80,6 @@ struct agp_memory { bool is_bound; bool is_flushed; bool vmalloc_flag; - bool sg_vmalloc_flag; /* list of agp_memory mapped to the aperture */ struct list_head mapped_list; /* DMA-mapped addresses */ -- cgit v1.2.3-71-gd317 From 42c4ab41a176ee784c0f28c0b29025a8fc34f05a Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 29 Jul 2009 12:15:26 +0200 Subject: itimers: Merge ITIMER_VIRT and ITIMER_PROF Both cpu itimers have same data flow in the few places, this patch make unification of code related with VIRT and PROF itimers. Signed-off-by: Stanislaw Gruszka Acked-by: Peter Zijlstra Acked-by: Thomas Gleixner Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Mackerras Cc: Benjamin Herrenschmidt LKML-Reference: <1248862529-6063-2-git-send-email-sgruszka@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 14 ++++- kernel/fork.c | 9 +-- kernel/itimer.c | 146 +++++++++++++++++++++------------------------- kernel/posix-cpu-timers.c | 98 +++++++++++++++---------------- 4 files changed, 130 insertions(+), 137 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3ab08e4bb6b8..3b3efaddd953 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -470,6 +470,11 @@ struct pacct_struct { unsigned long ac_minflt, ac_majflt; }; +struct cpu_itimer { + cputime_t expires; + cputime_t incr; +}; + /** * struct task_cputime - collected CPU time counts * @utime: time spent in user mode, in &cputime_t units @@ -564,9 +569,12 @@ struct signal_struct { struct pid *leader_pid; ktime_t it_real_incr; - /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */ - cputime_t it_prof_expires, it_virt_expires; - cputime_t it_prof_incr, it_virt_incr; + /* + * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use + * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these + * values are defined to 0 and 1 respectively + */ + struct cpu_itimer it[2]; /* * Thread group totals for process CPU timers. diff --git a/kernel/fork.c b/kernel/fork.c index 29b532e718f7..893ab0bf5e39 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -790,10 +791,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) thread_group_cputime_init(sig); /* Expiration times and increments. */ - sig->it_virt_expires = cputime_zero; - sig->it_virt_incr = cputime_zero; - sig->it_prof_expires = cputime_zero; - sig->it_prof_incr = cputime_zero; + sig->it[CPUCLOCK_PROF].expires = cputime_zero; + sig->it[CPUCLOCK_PROF].incr = cputime_zero; + sig->it[CPUCLOCK_VIRT].expires = cputime_zero; + sig->it[CPUCLOCK_VIRT].incr = cputime_zero; /* Cached expiration times. */ sig->cputime_expires.prof_exp = cputime_zero; diff --git a/kernel/itimer.c b/kernel/itimer.c index 58762f7077ec..852c88ddd1f0 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -41,10 +41,43 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) return ktime_to_timeval(rem); } +static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + struct itimerval *value) +{ + cputime_t cval, cinterval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (!cputime_eq(cval, cputime_zero)) { + struct task_cputime cputime; + cputime_t t; + + thread_group_cputimer(tsk, &cputime); + if (clock_id == CPUCLOCK_PROF) + t = cputime_add(cputime.utime, cputime.stime); + else + /* CPUCLOCK_VIRT */ + t = cputime.utime; + + if (cputime_le(cval, t)) + /* about to fire */ + cval = jiffies_to_cputime(1); + else + cval = cputime_sub(cval, t); + } + + spin_unlock_irq(&tsk->sighand->siglock); + + cputime_to_timeval(cval, &value->it_value); + cputime_to_timeval(cinterval, &value->it_interval); +} + int do_getitimer(int which, struct itimerval *value) { struct task_struct *tsk = current; - cputime_t cinterval, cval; switch (which) { case ITIMER_REAL: @@ -55,44 +88,10 @@ int do_getitimer(int which, struct itimerval *value) spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_virt_expires; - cinterval = tsk->signal->it_virt_incr; - if (!cputime_eq(cval, cputime_zero)) { - struct task_cputime cputime; - cputime_t utime; - - thread_group_cputimer(tsk, &cputime); - utime = cputime.utime; - if (cputime_le(cval, utime)) { /* about to fire */ - cval = jiffies_to_cputime(1); - } else { - cval = cputime_sub(cval, utime); - } - } - spin_unlock_irq(&tsk->sighand->siglock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); + get_cpu_itimer(tsk, CPUCLOCK_VIRT, value); break; case ITIMER_PROF: - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_prof_expires; - cinterval = tsk->signal->it_prof_incr; - if (!cputime_eq(cval, cputime_zero)) { - struct task_cputime times; - cputime_t ptime; - - thread_group_cputimer(tsk, ×); - ptime = cputime_add(times.utime, times.stime); - if (cputime_le(cval, ptime)) { /* about to fire */ - cval = jiffies_to_cputime(1); - } else { - cval = cputime_sub(cval, ptime); - } - } - spin_unlock_irq(&tsk->sighand->siglock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); + get_cpu_itimer(tsk, CPUCLOCK_PROF, value); break; default: return(-EINVAL); @@ -128,6 +127,36 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + struct itimerval *value, struct itimerval *ovalue) +{ + cputime_t cval, cinterval, nval, ninterval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + nval = timeval_to_cputime(&value->it_value); + ninterval = timeval_to_cputime(&value->it_interval); + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (!cputime_eq(cval, cputime_zero) || + !cputime_eq(nval, cputime_zero)) { + if (cputime_gt(nval, cputime_zero)) + nval = cputime_add(nval, jiffies_to_cputime(1)); + set_process_cpu_timer(tsk, clock_id, &nval, &cval); + } + it->expires = nval; + it->incr = ninterval; + + spin_unlock_irq(&tsk->sighand->siglock); + + if (ovalue) { + cputime_to_timeval(cval, &ovalue->it_value); + cputime_to_timeval(cinterval, &ovalue->it_interval); + } +} + /* * Returns true if the timeval is in canonical form */ @@ -139,7 +168,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) struct task_struct *tsk = current; struct hrtimer *timer; ktime_t expires; - cputime_t cval, cinterval, nval, ninterval; /* * Validate the timevals in value. @@ -174,48 +202,10 @@ again: spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - nval = timeval_to_cputime(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_virt_expires; - cinterval = tsk->signal->it_virt_incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, - jiffies_to_cputime(1)); - set_process_cpu_timer(tsk, CPUCLOCK_VIRT, - &nval, &cval); - } - tsk->signal->it_virt_expires = nval; - tsk->signal->it_virt_incr = ninterval; - spin_unlock_irq(&tsk->sighand->siglock); - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } + set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue); break; case ITIMER_PROF: - nval = timeval_to_cputime(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_prof_expires; - cinterval = tsk->signal->it_prof_incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, - jiffies_to_cputime(1)); - set_process_cpu_timer(tsk, CPUCLOCK_PROF, - &nval, &cval); - } - tsk->signal->it_prof_expires = nval; - tsk->signal->it_prof_incr = ninterval; - spin_unlock_irq(&tsk->sighand->siglock); - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } + set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue); break; default: return -EINVAL; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bece7c0b67b2..9b2d5e4dc8c4 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -14,11 +14,11 @@ */ void update_rlimit_cpu(unsigned long rlim_new) { - cputime_t cputime; + cputime_t cputime = secs_to_cputime(rlim_new); + struct signal_struct *const sig = current->signal; - cputime = secs_to_cputime(rlim_new); - if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || - cputime_gt(current->signal->it_prof_expires, cputime)) { + if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || + cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { spin_lock_irq(¤t->sighand->siglock); set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); spin_unlock_irq(¤t->sighand->siglock); @@ -613,6 +613,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) break; } } else { + struct signal_struct *const sig = p->signal; + union cpu_time_count *exp = &timer->it.cpu.expires; + /* * For a process timer, set the cached expiration time. */ @@ -620,30 +623,27 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) default: BUG(); case CPUCLOCK_VIRT: - if (!cputime_eq(p->signal->it_virt_expires, + if (!cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && - cputime_lt(p->signal->it_virt_expires, - timer->it.cpu.expires.cpu)) + cputime_lt(sig->it[CPUCLOCK_VIRT].expires, + exp->cpu)) break; - p->signal->cputime_expires.virt_exp = - timer->it.cpu.expires.cpu; + sig->cputime_expires.virt_exp = exp->cpu; break; case CPUCLOCK_PROF: - if (!cputime_eq(p->signal->it_prof_expires, + if (!cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) && - cputime_lt(p->signal->it_prof_expires, - timer->it.cpu.expires.cpu)) + cputime_lt(sig->it[CPUCLOCK_PROF].expires, + exp->cpu)) break; - i = p->signal->rlim[RLIMIT_CPU].rlim_cur; + i = sig->rlim[RLIMIT_CPU].rlim_cur; if (i != RLIM_INFINITY && - i <= cputime_to_secs(timer->it.cpu.expires.cpu)) + i <= cputime_to_secs(exp->cpu)) break; - p->signal->cputime_expires.prof_exp = - timer->it.cpu.expires.cpu; + sig->cputime_expires.prof_exp = exp->cpu; break; case CPUCLOCK_SCHED: - p->signal->cputime_expires.sched_exp = - timer->it.cpu.expires.sched; + sig->cputime_expires.sched_exp = exp->sched; break; } } @@ -1070,6 +1070,27 @@ static void stop_process_timers(struct task_struct *tsk) spin_unlock_irqrestore(&cputimer->lock, flags); } +static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, + cputime_t *expires, cputime_t cur_time, int signo) +{ + if (cputime_eq(it->expires, cputime_zero)) + return; + + if (cputime_ge(cur_time, it->expires)) { + it->expires = it->incr; + if (!cputime_eq(it->expires, cputime_zero)) + it->expires = cputime_add(it->expires, cur_time); + + __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); + } + + if (!cputime_eq(it->expires, cputime_zero) && + (cputime_eq(*expires, cputime_zero) || + cputime_lt(it->expires, *expires))) { + *expires = it->expires; + } +} + /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1089,10 +1110,10 @@ static void check_process_timers(struct task_struct *tsk, * Don't sample the current process CPU clocks if there are no timers. */ if (list_empty(&timers[CPUCLOCK_PROF]) && - cputime_eq(sig->it_prof_expires, cputime_zero) && + cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) && sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && list_empty(&timers[CPUCLOCK_VIRT]) && - cputime_eq(sig->it_virt_expires, cputime_zero) && + cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && list_empty(&timers[CPUCLOCK_SCHED])) { stop_process_timers(tsk); return; @@ -1152,38 +1173,11 @@ static void check_process_timers(struct task_struct *tsk, /* * Check for the special case process timers. */ - if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { - if (cputime_ge(ptime, sig->it_prof_expires)) { - /* ITIMER_PROF fires and reloads. */ - sig->it_prof_expires = sig->it_prof_incr; - if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { - sig->it_prof_expires = cputime_add( - sig->it_prof_expires, ptime); - } - __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk); - } - if (!cputime_eq(sig->it_prof_expires, cputime_zero) && - (cputime_eq(prof_expires, cputime_zero) || - cputime_lt(sig->it_prof_expires, prof_expires))) { - prof_expires = sig->it_prof_expires; - } - } - if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { - if (cputime_ge(utime, sig->it_virt_expires)) { - /* ITIMER_VIRTUAL fires and reloads. */ - sig->it_virt_expires = sig->it_virt_incr; - if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { - sig->it_virt_expires = cputime_add( - sig->it_virt_expires, utime); - } - __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk); - } - if (!cputime_eq(sig->it_virt_expires, cputime_zero) && - (cputime_eq(virt_expires, cputime_zero) || - cputime_lt(sig->it_virt_expires, virt_expires))) { - virt_expires = sig->it_virt_expires; - } - } + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, + SIGPROF); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, + SIGVTALRM); + if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { unsigned long psecs = cputime_to_secs(ptime); cputime_t x; -- cgit v1.2.3-71-gd317 From 8356b5f9c424e5831715abbce747197c30d1fd71 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 29 Jul 2009 12:15:27 +0200 Subject: itimers: Fix periodic tics precision Measure ITIMER_PROF and ITIMER_VIRT timers interval error between real ticks and requested by user. Take it into account when scheduling next tick. This patch introduce possibility where time between two consecutive tics is smaller then requested interval, it preserve however dependency that n tick is generated not earlier than n*interval time - counting from the beginning of periodic signal generation. Signed-off-by: Stanislaw Gruszka Acked-by: Peter Zijlstra Acked-by: Thomas Gleixner Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Mackerras Cc: Benjamin Herrenschmidt LKML-Reference: <1248862529-6063-3-git-send-email-sgruszka@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/itimer.c | 24 +++++++++++++++++++++--- kernel/posix-cpu-timers.c | 20 +++++++++++++++++--- 3 files changed, 40 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3b3efaddd953..a069e65e8bb7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -473,6 +473,8 @@ struct pacct_struct { struct cpu_itimer { cputime_t expires; cputime_t incr; + u32 error; + u32 incr_error; }; /** diff --git a/kernel/itimer.c b/kernel/itimer.c index 852c88ddd1f0..21adff7b2a17 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -42,7 +42,7 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) } static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - struct itimerval *value) + struct itimerval *const value) { cputime_t cval, cinterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; @@ -127,14 +127,32 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns) +{ + struct timespec ts; + s64 cpu_ns; + + cputime_to_timespec(ct, &ts); + cpu_ns = timespec_to_ns(&ts); + + return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns; +} + static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - struct itimerval *value, struct itimerval *ovalue) + const struct itimerval *const value, + struct itimerval *const ovalue) { - cputime_t cval, cinterval, nval, ninterval; + cputime_t cval, nval, cinterval, ninterval; + s64 ns_ninterval, ns_nval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; nval = timeval_to_cputime(&value->it_value); + ns_nval = timeval_to_ns(&value->it_value); ninterval = timeval_to_cputime(&value->it_interval); + ns_ninterval = timeval_to_ns(&value->it_interval); + + it->incr_error = cputime_sub_ns(ninterval, ns_ninterval); + it->error = cputime_sub_ns(nval, ns_nval); spin_lock_irq(&tsk->sighand->siglock); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 9b2d5e4dc8c4..b60d644ea4b3 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1070,6 +1070,8 @@ static void stop_process_timers(struct task_struct *tsk) spin_unlock_irqrestore(&cputimer->lock, flags); } +static u32 onecputick; + static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, cputime_t *expires, cputime_t cur_time, int signo) { @@ -1077,9 +1079,16 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, return; if (cputime_ge(cur_time, it->expires)) { - it->expires = it->incr; - if (!cputime_eq(it->expires, cputime_zero)) - it->expires = cputime_add(it->expires, cur_time); + if (!cputime_eq(it->incr, cputime_zero)) { + it->expires = cputime_add(it->expires, it->incr); + it->error += it->incr_error; + if (it->error >= onecputick) { + it->expires = cputime_sub(it->expires, + jiffies_to_cputime(1)); + it->error -= onecputick; + } + } else + it->expires = cputime_zero; __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); } @@ -1696,10 +1705,15 @@ static __init int init_posix_cpu_timers(void) .nsleep = thread_cpu_nsleep, .nsleep_restart = thread_cpu_nsleep_restart, }; + struct timespec ts; register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + cputime_to_timespec(jiffies_to_cputime(1), &ts); + onecputick = ts.tv_nsec; + WARN_ON(ts.tv_sec != 0); + return 0; } __initcall(init_posix_cpu_timers); -- cgit v1.2.3-71-gd317 From bbff2e433e80fae72c8d00d482927d52ec19ba33 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 6 Aug 2009 11:36:25 +0300 Subject: slab: remove duplicate kmem_cache_init_late() declarations kmem_cache_init_late() has been declared in slab.h CC: Nick Piggin CC: Matt Mackall CC: Christoph Lameter Signed-off-by: Wu Fengguang Signed-off-by: Pekka Enberg --- include/linux/slob_def.h | 5 ----- include/linux/slub_def.h | 2 -- mm/slob.c | 5 +++++ 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h index bb5368df4be8..0ec00b39d006 100644 --- a/include/linux/slob_def.h +++ b/include/linux/slob_def.h @@ -34,9 +34,4 @@ static __always_inline void *__kmalloc(size_t size, gfp_t flags) return kmalloc(size, flags); } -static inline void kmem_cache_init_late(void) -{ - /* Nothing to do */ -} - #endif /* __LINUX_SLOB_DEF_H */ diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index c1c862b1d01a..76c831693616 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -304,6 +304,4 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) } #endif -void __init kmem_cache_init_late(void); - #endif /* _LINUX_SLUB_DEF_H */ diff --git a/mm/slob.c b/mm/slob.c index 9641da3d5e58..837ebd64cc34 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -692,3 +692,8 @@ void __init kmem_cache_init(void) { slob_ready = 1; } + +void __init kmem_cache_init_late(void) +{ + /* Nothing to do */ +} -- cgit v1.2.3-71-gd317 From fa56d4cb4022c8b313c3b99236e1b87effc3655b Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 23 Jun 2009 11:29:11 +0000 Subject: ide: allow ide_dev_read_id() to be called from the IRQ context * Un-static __ide_wait_stat(). * Allow ide_dev_read_id() helper to be called from the IRQ context by adding irq_ctx flag and using mdelay()/__ide_wait_stat() when needed. * Switch ide_driveid_update() to set irq_ctx flag. This change is needed for the consecutive patch which fixes races in handling of user-space SET XFER commands but for improved bisectability and clarity it is better to do it in a separate patch. Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: David S. Miller --- drivers/ide/ide-iops.c | 6 +++--- drivers/ide/ide-probe.c | 31 +++++++++++++++++++++---------- include/linux/ide.h | 3 ++- 3 files changed, 26 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 2892b242bbe1..b99873845d21 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -102,8 +102,8 @@ EXPORT_SYMBOL(ide_fixstring); * setting a timer to wake up at half second intervals thereafter, * until timeout is achieved, before timing out. */ -static int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad, - unsigned long timeout, u8 *rstat) +int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad, + unsigned long timeout, u8 *rstat) { ide_hwif_t *hwif = drive->hwif; const struct ide_tp_ops *tp_ops = hwif->tp_ops; @@ -316,7 +316,7 @@ int ide_driveid_update(ide_drive_t *drive) return 0; SELECT_MASK(drive, 1); - rc = ide_dev_read_id(drive, ATA_CMD_ID_ATA, id); + rc = ide_dev_read_id(drive, ATA_CMD_ID_ATA, id, 1); SELECT_MASK(drive, 0); if (rc) diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 1bb106f6221a..8de442cbee94 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -238,6 +238,7 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id) * @drive: drive to identify * @cmd: command to use * @id: buffer for IDENTIFY data + * @irq_ctx: flag set when called from the IRQ context * * Sends an ATA(PI) IDENTIFY request to a drive and waits for a response. * @@ -246,7 +247,7 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id) * 2 device aborted the command (refused to identify itself) */ -int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) +int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id, int irq_ctx) { ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; @@ -263,7 +264,10 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS); /* take a deep breath */ - msleep(50); + if (irq_ctx) + mdelay(50); + else + msleep(50); if (io_ports->ctl_addr && (hwif->host_flags & IDE_HFLAG_BROKEN_ALTSTATUS) == 0) { @@ -295,12 +299,19 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) timeout = ((cmd == ATA_CMD_ID_ATA) ? WAIT_WORSTCASE : WAIT_PIDENTIFY) / 2; - if (ide_busy_sleep(drive, timeout, use_altstatus)) - return 1; - /* wait for IRQ and ATA_DRQ */ - msleep(50); - s = tp_ops->read_status(hwif); + if (irq_ctx) { + rc = __ide_wait_stat(drive, ATA_DRQ, BAD_R_STAT, timeout, &s); + if (rc) + return 1; + } else { + rc = ide_busy_sleep(drive, timeout, use_altstatus); + if (rc) + return 1; + + msleep(50); + s = tp_ops->read_status(hwif); + } if (OK_STAT(s, ATA_DRQ, BAD_R_STAT)) { /* drive returned ID */ @@ -406,10 +417,10 @@ static int do_probe (ide_drive_t *drive, u8 cmd) if (OK_STAT(stat, ATA_DRDY, ATA_BUSY) || present || cmd == ATA_CMD_ID_ATAPI) { - rc = ide_dev_read_id(drive, cmd, id); + rc = ide_dev_read_id(drive, cmd, id, 0); if (rc) /* failed: try again */ - rc = ide_dev_read_id(drive, cmd, id); + rc = ide_dev_read_id(drive, cmd, id, 0); stat = tp_ops->read_status(hwif); @@ -424,7 +435,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd) msleep(50); tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET); (void)ide_busy_sleep(drive, WAIT_WORSTCASE, 0); - rc = ide_dev_read_id(drive, cmd, id); + rc = ide_dev_read_id(drive, cmd, id, 0); } /* ensure drive IRQ is clear */ diff --git a/include/linux/ide.h b/include/linux/ide.h index edc93a6d931d..cb6cd0459a5e 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1081,6 +1081,7 @@ extern void ide_fixstring(u8 *, const int, const int); int ide_busy_sleep(ide_drive_t *, unsigned long, int); +int __ide_wait_stat(ide_drive_t *, u8, u8, unsigned long, u8 *); int ide_wait_stat(ide_startstop_t *, ide_drive_t *, u8, u8, unsigned long); ide_startstop_t ide_do_park_unpark(ide_drive_t *, struct request *); @@ -1169,7 +1170,7 @@ int ide_no_data_taskfile(ide_drive_t *, struct ide_cmd *); int ide_taskfile_ioctl(ide_drive_t *, unsigned long); -int ide_dev_read_id(ide_drive_t *, u8, u16 *); +int ide_dev_read_id(ide_drive_t *, u8, u16 *, int); extern int ide_driveid_update(ide_drive_t *); extern int ide_config_drive_speed(ide_drive_t *, u8); -- cgit v1.2.3-71-gd317 From 665d66e8fad60a5a162c4615f27f916ad1a6d567 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 23 Jun 2009 11:35:51 +0000 Subject: ide: fix races in handling of user-space SET XFER commands * Make cmd->tf_flags field 'u16' and add IDE_TFLAG_SET_XFER taskfile flag. * Update ide_finish_cmd() to set xfer / re-read id if the new flag is set. * Convert set_xfer_rate() (write handler for /proc/ide/hd?/current_speed) and ide_cmd_ioctl() (HDIO_DRIVE_CMD ioctl handler) to use the new flag. * Remove no longer needed disable_irq_nosync() + enable_irq() from ide_config_drive_speed(). Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: David S. Miller --- drivers/ide/ide-ioctls.c | 8 ++------ drivers/ide/ide-iops.c | 10 ---------- drivers/ide/ide-proc.c | 10 ++-------- drivers/ide/ide-taskfile.c | 9 ++++++++- include/linux/ide.h | 3 ++- 5 files changed, 14 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index e246d3d3fbcc..d3440b5010a5 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -167,6 +167,8 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg) err = -EINVAL; goto abort; } + + cmd.tf_flags |= IDE_TFLAG_SET_XFER; } err = ide_raw_taskfile(drive, &cmd, buf, args[3]); @@ -174,12 +176,6 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg) args[0] = tf->status; args[1] = tf->error; args[2] = tf->nsect; - - if (!err && xfer_rate) { - /* active-retuning-calls future */ - ide_set_xfer_rate(drive, xfer_rate); - ide_driveid_update(drive); - } abort: if (copy_to_user((void __user *)arg, &args, 4)) err = -EFAULT; diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index b99873845d21..b14fa9a87c49 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -363,14 +363,6 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) * this point (lost interrupt). */ - /* - * FIXME: we race against the running IRQ here if - * this is called from non IRQ context. If we use - * disable_irq() we hang on the error path. Work - * is needed. - */ - disable_irq_nosync(hwif->irq); - udelay(1); tp_ops->dev_select(drive); SELECT_MASK(drive, 1); @@ -394,8 +386,6 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) SELECT_MASK(drive, 0); - enable_irq(hwif->irq); - if (error) { (void) ide_dump_status(drive, "set_drive_speed_status", stat); return error; diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c index 3242698832a4..021de41655e6 100644 --- a/drivers/ide/ide-proc.c +++ b/drivers/ide/ide-proc.c @@ -195,7 +195,6 @@ ide_devset_get(xfer_rate, current_speed); static int set_xfer_rate (ide_drive_t *drive, int arg) { struct ide_cmd cmd; - int err; if (arg < XFER_PIO_0 || arg > XFER_UDMA_6) return -EINVAL; @@ -206,14 +205,9 @@ static int set_xfer_rate (ide_drive_t *drive, int arg) cmd.tf.nsect = (u8)arg; cmd.valid.out.tf = IDE_VALID_FEATURE | IDE_VALID_NSECT; cmd.valid.in.tf = IDE_VALID_NSECT; + cmd.tf_flags = IDE_TFLAG_SET_XFER; - err = ide_no_data_taskfile(drive, &cmd); - - if (!err) { - ide_set_xfer_rate(drive, (u8) arg); - ide_driveid_update(drive); - } - return err; + return ide_no_data_taskfile(drive, &cmd); } ide_devset_rw(current_speed, xfer_rate); diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 50336d51eebc..cc8633cbe133 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -324,10 +324,17 @@ static void ide_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd) void ide_finish_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat) { struct request *rq = drive->hwif->rq; - u8 err = ide_read_error(drive); + u8 err = ide_read_error(drive), nsect = cmd->tf.nsect; + u8 set_xfer = !!(cmd->tf_flags & IDE_TFLAG_SET_XFER); ide_complete_cmd(drive, cmd, stat, err); rq->errors = err; + + if (err == 0 && set_xfer) { + ide_set_xfer_rate(drive, nsect); + ide_driveid_update(drive); + } + ide_complete_rq(drive, err ? -EIO : 0, blk_rq_bytes(rq)); } diff --git a/include/linux/ide.h b/include/linux/ide.h index cb6cd0459a5e..803c1ae31237 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -258,6 +258,7 @@ enum { IDE_TFLAG_DYN = (1 << 5), IDE_TFLAG_FS = (1 << 6), IDE_TFLAG_MULTI_PIO = (1 << 7), + IDE_TFLAG_SET_XFER = (1 << 8), }; enum { @@ -294,7 +295,7 @@ struct ide_cmd { } out, in; } valid; - u8 tf_flags; + u16 tf_flags; u8 ftf_flags; /* for TASKFILE ioctl */ int protocol; -- cgit v1.2.3-71-gd317 From 25a70e38cd57952b09a013bf070e03705ee4f2ff Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 12 Aug 2009 00:50:09 -0700 Subject: Input: eeti_ts - allow active high irq lines This adds a struct eeti_ts_platform_data which currently holds only one value to specify the interrupt polarity. The driver has a fallback if no platform data is passed in via the i2c_board_info, so no regression is caused. Signed-off-by: Daniel Mack Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/eeti_ts.c | 22 +++++++++++++++++++--- include/linux/input/eeti_ts.h | 9 +++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) create mode 100644 include/linux/input/eeti_ts.h (limited to 'include/linux') diff --git a/drivers/input/touchscreen/eeti_ts.c b/drivers/input/touchscreen/eeti_ts.c index 3ab92222a525..9029bd3f34e5 100644 --- a/drivers/input/touchscreen/eeti_ts.c +++ b/drivers/input/touchscreen/eeti_ts.c @@ -32,6 +32,7 @@ #include #include #include +#include static int flip_x; module_param(flip_x, bool, 0644); @@ -46,7 +47,7 @@ struct eeti_ts_priv { struct input_dev *input; struct work_struct work; struct mutex mutex; - int irq; + int irq, irq_active_high; }; #define EETI_TS_BITDEPTH (11) @@ -58,6 +59,11 @@ struct eeti_ts_priv { #define REPORT_BIT_HAS_PRESSURE (1 << 6) #define REPORT_RES_BITS(v) (((v) >> 1) + EETI_TS_BITDEPTH) +static inline int eeti_ts_irq_active(struct eeti_ts_priv *priv) +{ + return gpio_get_value(irq_to_gpio(priv->irq)) == priv->irq_active_high; +} + static void eeti_ts_read(struct work_struct *work) { char buf[6]; @@ -67,7 +73,7 @@ static void eeti_ts_read(struct work_struct *work) mutex_lock(&priv->mutex); - while (!gpio_get_value(irq_to_gpio(priv->irq)) && --to) + while (eeti_ts_irq_active(priv) && --to) i2c_master_recv(priv->client, buf, sizeof(buf)); if (!to) { @@ -140,8 +146,10 @@ static void eeti_ts_close(struct input_dev *dev) static int __devinit eeti_ts_probe(struct i2c_client *client, const struct i2c_device_id *idp) { + struct eeti_ts_platform_data *pdata; struct eeti_ts_priv *priv; struct input_dev *input; + unsigned int irq_flags; int err = -ENOMEM; /* In contrast to what's described in the datasheet, there seems @@ -180,6 +188,14 @@ static int __devinit eeti_ts_probe(struct i2c_client *client, priv->input = input; priv->irq = client->irq; + pdata = client->dev.platform_data; + + if (pdata) + priv->irq_active_high = pdata->irq_active_high; + + irq_flags = priv->irq_active_high ? + IRQF_TRIGGER_RISING : IRQF_TRIGGER_FALLING; + INIT_WORK(&priv->work, eeti_ts_read); i2c_set_clientdata(client, priv); input_set_drvdata(input, priv); @@ -188,7 +204,7 @@ static int __devinit eeti_ts_probe(struct i2c_client *client, if (err) goto err1; - err = request_irq(priv->irq, eeti_ts_isr, IRQF_TRIGGER_FALLING, + err = request_irq(priv->irq, eeti_ts_isr, irq_flags, client->name, priv); if (err) { dev_err(&client->dev, "Unable to request touchscreen IRQ.\n"); diff --git a/include/linux/input/eeti_ts.h b/include/linux/input/eeti_ts.h new file mode 100644 index 000000000000..f875b316249d --- /dev/null +++ b/include/linux/input/eeti_ts.h @@ -0,0 +1,9 @@ +#ifndef LINUX_INPUT_EETI_TS_H +#define LINUX_INPUT_EETI_TS_H + +struct eeti_ts_platform_data { + unsigned int irq_active_high; +}; + +#endif /* LINUX_INPUT_EETI_TS_H */ + -- cgit v1.2.3-71-gd317 From 00ae4064b1445524752575dd84df227c0687c99d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:49 +0900 Subject: percpu: rename 4k first chunk allocator to page Page size isn't always 4k depending on arch and configuration. Rename 4k first chunk allocator to page. Signed-off-by: Tejun Heo Cc: David Howells --- Documentation/kernel-parameters.txt | 2 +- arch/x86/kernel/setup_percpu.c | 23 ++++++++++++----------- include/linux/percpu.h | 2 +- mm/percpu.c | 25 ++++++++++++++----------- 4 files changed, 28 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7936b801fe6a..12e9eb77ee0d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1920,7 +1920,7 @@ and is between 256 and 4096 characters. It is defined in the file See arch/parisc/kernel/pdc_chassis.c percpu_alloc= [X86] Select which percpu first chunk allocator to use. - Allowed values are one of "lpage", "embed" and "4k". + Allowed values are one of "lpage", "embed" and "page". See comments in arch/x86/kernel/setup_percpu.c for details on each allocator. This parameter is primarily for debugging and performance comparison. diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index a26ff61e2fb0..1e17711c29d6 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -249,21 +249,22 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) } /* - * 4k allocator + * Page allocator * - * Boring fallback 4k allocator. This allocator puts more pressure on - * PTE TLBs but other than that behaves nicely on both UMA and NUMA. + * Boring fallback 4k page allocator. This allocator puts more + * pressure on PTE TLBs but other than that behaves nicely on both UMA + * and NUMA. */ -static void __init pcpu4k_populate_pte(unsigned long addr) +static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_4k(size_t static_size) +static ssize_t __init setup_pcpu_page(size_t static_size) { - return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - pcpu_fc_alloc, pcpu_fc_free, - pcpu4k_populate_pte); + return pcpu_page_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); } /* for explicit first chunk allocator selection */ @@ -307,7 +308,7 @@ void __init setup_per_cpu_areas(void) */ ret = -EINVAL; if (strlen(pcpu_chosen_alloc)) { - if (strcmp(pcpu_chosen_alloc, "4k")) { + if (strcmp(pcpu_chosen_alloc, "page")) { if (!strcmp(pcpu_chosen_alloc, "lpage")) ret = setup_pcpu_lpage(static_size, true); else if (!strcmp(pcpu_chosen_alloc, "embed")) @@ -317,7 +318,7 @@ void __init setup_per_cpu_areas(void) "specified\n", pcpu_chosen_alloc); if (ret < 0) pr_warning("PERCPU: %s allocator failed (%zd), " - "falling back to 4k\n", + "falling back to page size\n", pcpu_chosen_alloc, ret); } } else { @@ -326,7 +327,7 @@ void __init setup_per_cpu_areas(void) ret = setup_pcpu_embed(static_size, false); } if (ret < 0) - ret = setup_pcpu_4k(static_size); + ret = setup_pcpu_page(static_size); if (ret < 0) panic("cannot allocate static percpu area (%zu bytes, err=%zd)", static_size, ret); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e134c8229631..7989f61b03f3 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -74,7 +74,7 @@ extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); -extern ssize_t __init pcpu_4k_first_chunk( +extern ssize_t __init pcpu_page_first_chunk( size_t static_size, size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, diff --git a/mm/percpu.c b/mm/percpu.c index cbddcbdab681..6feac7934904 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1497,15 +1497,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } /** - * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages + * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE * @free_fn: funtion to free percpu page, always called with PAGE_SIZE * @populate_pte_fn: function to populate pte * - * This is a helper to ease setting up embedded first percpu chunk and - * can be called where pcpu_setup_first_chunk() is expected. + * This is a helper to ease setting up page-remapped first percpu + * chunk and can be called where pcpu_setup_first_chunk() is expected. * * This is the basic allocator. Static percpu area is allocated * page-by-page into vmalloc area. @@ -1514,12 +1514,13 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn) +ssize_t __init pcpu_page_first_chunk(size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; + char psize_str[16]; int unit_pages; size_t pages_size; struct page **pages; @@ -1527,6 +1528,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, int i, j; ssize_t ret; + snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); + unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, PCPU_MIN_UNIT_SIZE)); @@ -1542,8 +1545,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, ptr = alloc_fn(cpu, PAGE_SIZE); if (!ptr) { - pr_warning("PERCPU: failed to allocate " - "4k page for cpu%u\n", cpu); + pr_warning("PERCPU: failed to allocate %s page " + "for cpu%u\n", psize_str, cpu); goto enomem; } pages[j++] = virt_to_page(ptr); @@ -1580,8 +1583,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, } /* we're ready, commit */ - pr_info("PERCPU: %d 4k pages/cpu @%p s%zu r%zu\n", - unit_pages, vm.addr, static_size, reserved_size); + pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu\n", + unit_pages, psize_str, vm.addr, static_size, reserved_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, unit_pages << PAGE_SHIFT, vm.addr, NULL); -- cgit v1.2.3-71-gd317 From 08fc45806103e59a37418e84719b878f9bb32540 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:49 +0900 Subject: percpu: build first chunk allocators selectively There's no need to build unused first chunk allocators in. Define CONFIG_NEED_PER_CPU_*_FIRST_CHUNK and let archs enable them selectively. Signed-off-by: Tejun Heo --- arch/x86/Kconfig | 10 ++++++++++ include/linux/percpu.h | 27 +++++---------------------- mm/percpu.c | 19 +++++++++++-------- 3 files changed, 26 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e06b2eeff9f2..f7ac27215512 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -150,6 +150,16 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y +config NEED_PER_CPU_EMBED_FIRST_CHUNK + def_bool y + +config NEED_PER_CPU_PAGE_FIRST_CHUNK + def_bool y + +config NEED_PER_CPU_LPAGE_FIRST_CHUNK + def_bool y + depends on NEED_MULTIPLE_NODES + config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 7989f61b03f3..e26788e0da4a 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -70,17 +70,21 @@ extern size_t __init pcpu_setup_first_chunk( ssize_t dyn_size, size_t unit_size, void *base_addr, const int *unit_map); +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); +#endif +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern ssize_t __init pcpu_page_first_chunk( size_t static_size, size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); +#endif -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK extern int __init pcpu_lpage_build_unit_map( size_t static_size, size_t reserved_size, ssize_t *dyn_sizep, size_t *unit_sizep, @@ -98,27 +102,6 @@ extern ssize_t __init pcpu_lpage_first_chunk( extern void *pcpu_lpage_remapped(void *kaddr); #else -static inline int pcpu_lpage_build_unit_map( - size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep, size_t *unit_sizep, - size_t lpage_size, int *unit_map, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn) -{ - return -EINVAL; -} - -static inline ssize_t __init pcpu_lpage_first_chunk( - size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - size_t lpage_size, const int *unit_map, - int nr_units, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn) -{ - return -EINVAL; -} - static inline void *pcpu_lpage_remapped(void *kaddr) { return NULL; diff --git a/mm/percpu.c b/mm/percpu.c index 6feac7934904..7971997de310 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1414,8 +1414,9 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, return pcpu_unit_size; } -static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep) +static inline size_t pcpu_calc_fc_sizes(size_t static_size, + size_t reserved_size, + ssize_t *dyn_sizep) { size_t size_sum; @@ -1427,6 +1428,8 @@ static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, return size_sum; } +#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ + !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @static_size: the size of static percpu area in bytes @@ -1495,7 +1498,10 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, unit_size, base, NULL); } +#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || + !CONFIG_HAVE_SETUP_PER_CPU_AREA */ +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @static_size: the size of static percpu area in bytes @@ -1598,12 +1604,9 @@ out_free_ar: free_bootmem(__pa(pages), pages_size); return ret; } +#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ -/* - * Large page remapping first chunk setup helper - */ -#ifdef CONFIG_NEED_MULTIPLE_NODES - +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK /** * pcpu_lpage_build_unit_map - build unit_map for large page remapping * @static_size: the size of static percpu area in bytes @@ -1982,7 +1985,7 @@ void *pcpu_lpage_remapped(void *kaddr) return NULL; } -#endif +#endif /* CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK */ /* * Generic percpu area setup. -- cgit v1.2.3-71-gd317 From f58dc01ba2ca9fe3ab2ba4ca43d9c8a735cf62d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: percpu: generalize first chunk allocator selection Now that all first chunk allocators are in mm/percpu.c, it makes sense to make generalize percpu_alloc kernel parameter. Define PCPU_FC_* and set pcpu_chosen_fc using early_param() in mm/percpu.c. Arch code can use the set value to determine which first chunk allocator to use. Signed-off-by: Tejun Heo --- Documentation/kernel-parameters.txt | 11 ++++++----- arch/x86/kernel/setup_percpu.c | 24 ++++++------------------ include/linux/percpu.h | 12 ++++++++++++ mm/percpu.c | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 12e9eb77ee0d..dee9ce2e6cfa 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1919,11 +1919,12 @@ and is between 256 and 4096 characters. It is defined in the file Format: { 0 | 1 } See arch/parisc/kernel/pdc_chassis.c - percpu_alloc= [X86] Select which percpu first chunk allocator to use. - Allowed values are one of "lpage", "embed" and "page". - See comments in arch/x86/kernel/setup_percpu.c for - details on each allocator. This parameter is primarily - for debugging and performance comparison. + percpu_alloc= Select which percpu first chunk allocator to use. + Currently supported values are "embed", "page" and + "lpage". Archs may support subset or none of the + selections. See comments in mm/percpu.c for details + on each allocator. This parameter is primarily for + debugging and performance comparison. pf. [PARIDE] See Documentation/blockdev/paride.txt. diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 1e17711c29d6..b961d99e6416 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -267,16 +267,6 @@ static ssize_t __init setup_pcpu_page(size_t static_size) pcpup_populate_pte); } -/* for explicit first chunk allocator selection */ -static char pcpu_chosen_alloc[16] __initdata; - -static int __init percpu_alloc_setup(char *str) -{ - strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); - return 0; -} -early_param("percpu_alloc", percpu_alloc_setup); - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -307,19 +297,17 @@ void __init setup_per_cpu_areas(void) * each allocator for details. */ ret = -EINVAL; - if (strlen(pcpu_chosen_alloc)) { - if (strcmp(pcpu_chosen_alloc, "page")) { - if (!strcmp(pcpu_chosen_alloc, "lpage")) + if (pcpu_chosen_fc != PCPU_FC_AUTO) { + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + if (pcpu_chosen_fc == PCPU_FC_LPAGE) ret = setup_pcpu_lpage(static_size, true); - else if (!strcmp(pcpu_chosen_alloc, "embed")) - ret = setup_pcpu_embed(static_size, true); else - pr_warning("PERCPU: unknown allocator %s " - "specified\n", pcpu_chosen_alloc); + ret = setup_pcpu_embed(static_size, true); + if (ret < 0) pr_warning("PERCPU: %s allocator failed (%zd), " "falling back to page size\n", - pcpu_chosen_alloc, ret); + pcpu_fc_names[pcpu_chosen_fc], ret); } } else { ret = setup_pcpu_lpage(static_size, false); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e26788e0da4a..9be05cbe5ee0 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -59,6 +59,18 @@ extern void *pcpu_base_addr; extern const int *pcpu_unit_map; +enum pcpu_fc { + PCPU_FC_AUTO, + PCPU_FC_EMBED, + PCPU_FC_PAGE, + PCPU_FC_LPAGE, + + PCPU_FC_NR, +}; +extern const char *pcpu_fc_names[PCPU_FC_NR]; + +extern enum pcpu_fc pcpu_chosen_fc; + typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); diff --git a/mm/percpu.c b/mm/percpu.c index 7971997de310..7fb40bb1555a 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1414,6 +1414,38 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, return pcpu_unit_size; } +const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { + [PCPU_FC_AUTO] = "auto", + [PCPU_FC_EMBED] = "embed", + [PCPU_FC_PAGE] = "page", + [PCPU_FC_LPAGE] = "lpage", +}; + +enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; + +static int __init percpu_alloc_setup(char *str) +{ + if (0) + /* nada */; +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK + else if (!strcmp(str, "embed")) + pcpu_chosen_fc = PCPU_FC_EMBED; +#endif +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + else if (!strcmp(str, "page")) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK + else if (!strcmp(str, "lpage")) + pcpu_chosen_fc = PCPU_FC_LPAGE; +#endif + else + pr_warning("PERCPU: unknown allocator %s specified\n", str); + + return 0; +} +early_param("percpu_alloc", percpu_alloc_setup); + static inline size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, ssize_t *dyn_sizep) -- cgit v1.2.3-71-gd317 From 9a7737691e90d3cce0e5248f91826c50e5aa3fcf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: percpu: drop @static_size from first chunk allocators First chunk allocators assume percpu areas have been linked using one of PERCPU_*() macros and depend on __per_cpu_load symbol defined by those macros, so there isn't much point in passing in static area size explicitly when it can be easily calculated from __per_cpu_start and __per_cpu_end. Drop @static_size from all percpu first chunk allocators and helpers. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 34 +++++++++++++++------------------- include/linux/percpu.h | 18 ++++++++---------- mm/percpu.c | 29 +++++++++++++---------------- 3 files changed, 36 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index b961d99e6416..8aad486c688f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -157,7 +157,7 @@ static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) +static ssize_t __init setup_pcpu_lpage(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; @@ -184,8 +184,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) return -ENOMEM; } - ret = pcpu_lpage_build_unit_map(static_size, - PERCPU_FIRST_CHUNK_RESERVE, + ret = pcpu_lpage_build_unit_map(PERCPU_FIRST_CHUNK_RESERVE, &dyn_size, &unit_size, PMD_SIZE, unit_map, pcpu_lpage_cpu_distance); if (ret < 0) { @@ -208,9 +207,8 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) } } - ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - dyn_size, unit_size, PMD_SIZE, - unit_map, nr_units, + ret = pcpu_lpage_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + unit_size, PMD_SIZE, unit_map, nr_units, pcpu_fc_alloc, pcpu_fc_free, pcpul_map); out_free: if (ret < 0) @@ -218,7 +216,7 @@ out_free: return ret; } #else -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) +static ssize_t __init setup_pcpu_lpage(bool chosen) { return -EINVAL; } @@ -232,7 +230,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) * mapping so that it can use PMD mapping without additional TLB * pressure. */ -static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) +static ssize_t __init setup_pcpu_embed(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -244,7 +242,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) if (!chosen && (!cpu_has_pse || pcpu_need_numa())) return -EINVAL; - return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, reserve - PERCPU_FIRST_CHUNK_RESERVE); } @@ -260,9 +258,9 @@ static void __init pcpup_populate_pte(unsigned long addr) populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_page(size_t static_size) +static ssize_t __init setup_pcpu_page(void) { - return pcpu_page_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, pcpu_fc_alloc, pcpu_fc_free, pcpup_populate_pte); } @@ -282,7 +280,6 @@ static inline void setup_percpu_segment(int cpu) void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; unsigned int cpu; unsigned long delta; size_t pcpu_unit_size; @@ -300,9 +297,9 @@ void __init setup_per_cpu_areas(void) if (pcpu_chosen_fc != PCPU_FC_AUTO) { if (pcpu_chosen_fc != PCPU_FC_PAGE) { if (pcpu_chosen_fc == PCPU_FC_LPAGE) - ret = setup_pcpu_lpage(static_size, true); + ret = setup_pcpu_lpage(true); else - ret = setup_pcpu_embed(static_size, true); + ret = setup_pcpu_embed(true); if (ret < 0) pr_warning("PERCPU: %s allocator failed (%zd), " @@ -310,15 +307,14 @@ void __init setup_per_cpu_areas(void) pcpu_fc_names[pcpu_chosen_fc], ret); } } else { - ret = setup_pcpu_lpage(static_size, false); + ret = setup_pcpu_lpage(false); if (ret < 0) - ret = setup_pcpu_embed(static_size, false); + ret = setup_pcpu_embed(false); } if (ret < 0) - ret = setup_pcpu_page(static_size); + ret = setup_pcpu_page(); if (ret < 0) - panic("cannot allocate static percpu area (%zu bytes, err=%zd)", - static_size, ret); + panic("cannot initialize percpu area (err=%zd)", ret); pcpu_unit_size = ret; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 9be05cbe5ee0..be2fc8fb9b6f 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -84,13 +84,12 @@ extern size_t __init pcpu_setup_first_chunk( #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK extern ssize_t __init pcpu_embed_first_chunk( - size_t static_size, size_t reserved_size, - ssize_t dyn_size); + size_t reserved_size, ssize_t dyn_size); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern ssize_t __init pcpu_page_first_chunk( - size_t static_size, size_t reserved_size, + size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); @@ -98,16 +97,15 @@ extern ssize_t __init pcpu_page_first_chunk( #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK extern int __init pcpu_lpage_build_unit_map( - size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep, size_t *unit_sizep, - size_t lpage_size, int *unit_map, + size_t reserved_size, ssize_t *dyn_sizep, + size_t *unit_sizep, size_t lpage_size, + int *unit_map, pcpu_fc_cpu_distance_fn_t cpu_distance_fn); extern ssize_t __init pcpu_lpage_first_chunk( - size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - size_t lpage_size, const int *unit_map, - int nr_units, + size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); diff --git a/mm/percpu.c b/mm/percpu.c index 7fb40bb1555a..e2ac58a39bb2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1464,7 +1464,6 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size, !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @@ -1489,9 +1488,9 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size) +ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) { + const size_t static_size = __per_cpu_end - __per_cpu_start; size_t size_sum, unit_size, chunk_size; void *base; unsigned int cpu; @@ -1536,7 +1535,6 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE * @free_fn: funtion to free percpu page, always called with PAGE_SIZE @@ -1552,12 +1550,13 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_page_first_chunk(size_t static_size, size_t reserved_size, +ssize_t __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; + const size_t static_size = __per_cpu_end - __per_cpu_start; char psize_str[16]; int unit_pages; size_t pages_size; @@ -1641,7 +1640,6 @@ out_free_ar: #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK /** * pcpu_lpage_build_unit_map - build unit_map for large page remapping - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_sizep: in/out parameter for dynamic size, -1 for auto * @unit_sizep: out parameter for unit size @@ -1661,13 +1659,14 @@ out_free_ar: * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and * returns the number of units to be allocated. -errno on failure. */ -int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep, size_t *unit_sizep, - size_t lpage_size, int *unit_map, +int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, + size_t *unit_sizep, size_t lpage_size, + int *unit_map, pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; + const size_t static_size = __per_cpu_end - __per_cpu_start; int group_cnt_max = 0; size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ @@ -1819,7 +1818,6 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes * @unit_size: unit size in bytes @@ -1850,15 +1848,15 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - size_t lpage_size, const int *unit_map, - int nr_units, +ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { static struct vm_struct vm; + const size_t static_size = __per_cpu_end - __per_cpu_start; size_t chunk_size = unit_size * nr_units; size_t map_size; unsigned int cpu; @@ -2037,7 +2035,6 @@ EXPORT_SYMBOL(__per_cpu_offset); void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; ssize_t unit_size; unsigned long delta; unsigned int cpu; @@ -2046,7 +2043,7 @@ void __init setup_per_cpu_areas(void) * Always reserve area for module percpu variables. That's * what the legacy allocator did. */ - unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, + unit_size = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE); if (unit_size < 0) panic("Failed to initialized percpu areas."); -- cgit v1.2.3-71-gd317 From 1d9d32572163b30be81dbe1409dfa7ea9763d0e8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: percpu: make @dyn_size mandatory for pcpu_setup_first_chunk() Now that all actual first chunk allocation and copying happen in the first chunk allocators and helpers, there's no reason for pcpu_setup_first_chunk() to try to determine @dyn_size automatically. The only left user is page first chunk allocator. Make it determine dyn_size like other allocators and make @dyn_size mandatory for pcpu_setup_first_chunk(). Signed-off-by: Tejun Heo --- include/linux/percpu.h | 2 +- mm/percpu.c | 39 +++++++++++++++++++-------------------- 2 files changed, 20 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index be2fc8fb9b6f..0cfdd14d096a 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -79,7 +79,7 @@ typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t unit_size, + size_t dyn_size, size_t unit_size, void *base_addr, const int *unit_map); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK diff --git a/mm/percpu.c b/mm/percpu.c index e2ac58a39bb2..287f59cc5fb9 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1235,7 +1235,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_first_chunk - initialize the first percpu chunk * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes, 0 for none - * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @dyn_size: free size for dynamic allocation in bytes * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE * @base_addr: mapped address * @unit_map: cpu -> unit map, NULL for sequential mapping @@ -1252,10 +1252,9 @@ EXPORT_SYMBOL_GPL(free_percpu); * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * - * @dyn_size, if non-negative, determines the number of bytes - * available for dynamic allocation in the first chunk. Specifying - * non-negative value makes percpu leave alone the area beyond - * @static_size + @reserved_size + @dyn_size. + * @dyn_size determines the number of bytes available for dynamic + * allocation in the first chunk. The area between @static_size + + * @reserved_size + @dyn_size and @unit_size is unused. * * @unit_size specifies unit size and must be aligned to PAGE_SIZE and * equal to or larger than @static_size + @reserved_size + if @@ -1276,13 +1275,12 @@ EXPORT_SYMBOL_GPL(free_percpu); * percpu access. */ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t unit_size, + size_t dyn_size, size_t unit_size, void *base_addr, const int *unit_map) { static struct vm_struct first_vm; static int smap[2], dmap[2]; - size_t size_sum = static_size + reserved_size + - (dyn_size >= 0 ? dyn_size : 0); + size_t size_sum = static_size + reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; unsigned int cpu, tcpu; int i; @@ -1345,9 +1343,6 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); - if (dyn_size < 0) - dyn_size = pcpu_unit_size - static_size - reserved_size; - first_vm.flags = VM_ALLOC; first_vm.size = pcpu_chunk_size; first_vm.addr = base_addr; @@ -1557,6 +1552,8 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, { static struct vm_struct vm; const size_t static_size = __per_cpu_end - __per_cpu_start; + ssize_t dyn_size = -1; + size_t size_sum, unit_size; char psize_str[16]; int unit_pages; size_t pages_size; @@ -1567,8 +1564,9 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); - unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, - PCPU_MIN_UNIT_SIZE)); + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + unit_pages = unit_size >> PAGE_SHIFT; /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0])); @@ -1591,12 +1589,12 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; - vm.size = nr_cpu_ids * unit_pages << PAGE_SHIFT; + vm.size = nr_cpu_ids * unit_size; vm_area_register_early(&vm, PAGE_SIZE); for_each_possible_cpu(cpu) { - unsigned long unit_addr = (unsigned long)vm.addr + - (cpu * unit_pages << PAGE_SHIFT); + unsigned long unit_addr = + (unsigned long)vm.addr + cpu * unit_size; for (i = 0; i < unit_pages; i++) populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); @@ -1620,11 +1618,12 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, } /* we're ready, commit */ - pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu\n", - unit_pages, psize_str, vm.addr, static_size, reserved_size); + pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", + unit_pages, psize_str, vm.addr, static_size, reserved_size, + dyn_size); - ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, - unit_pages << PAGE_SHIFT, vm.addr, NULL); + ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, + unit_size, vm.addr, NULL); goto out_free_ar; enomem: -- cgit v1.2.3-71-gd317 From 3cbc85652767c38b252c8de55f9fd180b29e4c0d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: percpu: add @align to pcpu_fc_alloc_fn_t pcpu_fc_alloc_fn_t is about to see more interesting usage, add @align parameter. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 4 ++-- include/linux/percpu.h | 3 ++- mm/percpu.c | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 8aad486c688f..660cde133141 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -126,9 +126,9 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, /* * Helpers for first chunk memory allocation */ -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size) +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return pcpu_alloc_bootmem(cpu, size, size); + return pcpu_alloc_bootmem(cpu, size, align); } static void __init pcpu_fc_free(void *ptr, size_t size) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 0cfdd14d096a..d385dbcf190b 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -71,7 +71,8 @@ extern const char *pcpu_fc_names[PCPU_FC_NR]; extern enum pcpu_fc pcpu_chosen_fc; -typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); +typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, + size_t align); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); diff --git a/mm/percpu.c b/mm/percpu.c index 287f59cc5fb9..3316e3aac7ee 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1578,7 +1578,7 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, for (i = 0; i < unit_pages; i++) { void *ptr; - ptr = alloc_fn(cpu, PAGE_SIZE); + ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { pr_warning("PERCPU: failed to allocate %s page " "for cpu%u\n", psize_str, cpu); @@ -1888,7 +1888,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, goto found; continue; found: - ptr = alloc_fn(cpu, lpage_size); + ptr = alloc_fn(cpu, lpage_size, lpage_size); if (!ptr) { pr_warning("PERCPU: failed to allocate large page " "for cpu%u\n", cpu); -- cgit v1.2.3-71-gd317 From 033e48fb82958053113178264ddb9d5038d5e38b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:51 +0900 Subject: percpu: move pcpu_lpage_build_unit_map() and pcpul_lpage_dump_cfg() upward Unit map handling will be generalized and extended and used for embedding sparse first chunk and other purposes. Relocate two unit_map related functions upward in preparation. This patch just moves the code without any actual change. Signed-off-by: Tejun Heo --- include/linux/percpu.h | 14 +- mm/percpu.c | 339 +++++++++++++++++++++++++------------------------ 2 files changed, 180 insertions(+), 173 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index d385dbcf190b..570fb18de2ba 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -78,6 +78,14 @@ typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK +extern int __init pcpu_lpage_build_unit_map( + size_t reserved_size, ssize_t *dyn_sizep, + size_t *unit_sizep, size_t lpage_size, + int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn); +#endif + extern size_t __init pcpu_setup_first_chunk( size_t static_size, size_t reserved_size, size_t dyn_size, size_t unit_size, @@ -97,12 +105,6 @@ extern ssize_t __init pcpu_page_first_chunk( #endif #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern int __init pcpu_lpage_build_unit_map( - size_t reserved_size, ssize_t *dyn_sizep, - size_t *unit_sizep, size_t lpage_size, - int *unit_map, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn); - extern ssize_t __init pcpu_lpage_first_chunk( size_t reserved_size, size_t dyn_size, size_t unit_size, size_t lpage_size, diff --git a/mm/percpu.c b/mm/percpu.c index 3316e3aac7ee..2b9c4b2a2fc0 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1231,6 +1231,178 @@ void free_percpu(void *ptr) } EXPORT_SYMBOL_GPL(free_percpu); +static inline size_t pcpu_calc_fc_sizes(size_t static_size, + size_t reserved_size, + ssize_t *dyn_sizep) +{ + size_t size_sum; + + size_sum = PFN_ALIGN(static_size + reserved_size + + (*dyn_sizep >= 0 ? *dyn_sizep : 0)); + if (*dyn_sizep != 0) + *dyn_sizep = size_sum - static_size - reserved_size; + + return size_sum; +} + +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK +/** + * pcpu_lpage_build_unit_map - build unit_map for large page remapping + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_sizep: in/out parameter for dynamic size, -1 for auto + * @unit_sizep: out parameter for unit size + * @unit_map: unit_map to be filled + * @cpu_distance_fn: callback to determine distance between cpus + * + * This function builds cpu -> unit map and determine other parameters + * considering needed percpu size, large page size and distances + * between CPUs in NUMA. + * + * CPUs which are of LOCAL_DISTANCE both ways are grouped together and + * may share units in the same large page. The returned configuration + * is guaranteed to have CPUs on different nodes on different large + * pages and >=75% usage of allocated virtual address space. + * + * RETURNS: + * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and + * returns the number of units to be allocated. -errno on failure. + */ +int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, + size_t *unit_sizep, size_t lpage_size, + int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + const size_t static_size = __per_cpu_end - __per_cpu_start; + int group_cnt_max = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int last_allocs; + unsigned int cpu, tcpu; + int group, unit; + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is multiple of lpage_size and is the smallest + * which can accomodate 4k aligned segments which are equal to + * or larger than min_unit_size. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep); + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + alloc_size = roundup(min_unit_size, lpage_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + upa--; + max_upa = upa; + + /* group cpus according to their proximity */ + for_each_possible_cpu(cpu) { + group = 0; + next_group: + for_each_possible_cpu(tcpu) { + if (cpu == tcpu) + break; + if (group_map[tcpu] == group && + (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || + cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { + group++; + goto next_group; + } + } + group_map[cpu] = group; + group_cnt[group]++; + group_cnt_max = max(group_cnt_max, group_cnt[group]); + } + + /* + * Expand unit size until address space usage goes over 75% + * and then as much as possible without using more address + * space. + */ + last_allocs = INT_MAX; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; + + if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + continue; + + for (group = 0; group_cnt[group]; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 25%. The + * greater-than comparison ensures upa==1 always + * passes the following check. + */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; + } + *unit_sizep = alloc_size / best_upa; + + /* assign units to cpus accordingly */ + unit = 0; + for (group = 0; group_cnt[group]; group++) { + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + unit_map[cpu] = unit++; + unit = roundup(unit, best_upa); + } + + return unit; /* unit contains aligned number of units */ +} + +static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, + unsigned int *cpup); + +static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, + size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units) +{ + int width = 1, v = nr_units; + char empty_str[] = "--------"; + int upl, lpl; /* units per lpage, lpage per line */ + unsigned int cpu; + int lpage, unit; + + while (v /= 10) + width++; + empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0'; + + upl = max_t(int, lpage_size / unit_size, 1); + lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1)); + + printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl, + static_size, reserved_size, dyn_size, unit_size, lpage_size); + + for (lpage = 0, unit = 0; unit < nr_units; unit++) { + if (!(unit % upl)) { + if (!(lpage++ % lpl)) { + printk("\n"); + printk("%spcpu-lpage: ", lvl); + } else + printk("| "); + } + if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + printk("%0*d ", width, cpu); + else + printk("%s ", empty_str); + } + printk("\n"); +} +#endif + /** * pcpu_setup_first_chunk - initialize the first percpu chunk * @static_size: the size of static percpu area in bytes @@ -1441,20 +1613,6 @@ static int __init percpu_alloc_setup(char *str) } early_param("percpu_alloc", percpu_alloc_setup); -static inline size_t pcpu_calc_fc_sizes(size_t static_size, - size_t reserved_size, - ssize_t *dyn_sizep) -{ - size_t size_sum; - - size_sum = PFN_ALIGN(static_size + reserved_size + - (*dyn_sizep >= 0 ? *dyn_sizep : 0)); - if (*dyn_sizep != 0) - *dyn_sizep = size_sum - static_size - reserved_size; - - return size_sum; -} - #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) /** @@ -1637,122 +1795,6 @@ out_free_ar: #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -/** - * pcpu_lpage_build_unit_map - build unit_map for large page remapping - * @reserved_size: the size of reserved percpu area in bytes - * @dyn_sizep: in/out parameter for dynamic size, -1 for auto - * @unit_sizep: out parameter for unit size - * @unit_map: unit_map to be filled - * @cpu_distance_fn: callback to determine distance between cpus - * - * This function builds cpu -> unit map and determine other parameters - * considering needed percpu size, large page size and distances - * between CPUs in NUMA. - * - * CPUs which are of LOCAL_DISTANCE both ways are grouped together and - * may share units in the same large page. The returned configuration - * is guaranteed to have CPUs on different nodes on different large - * pages and >=75% usage of allocated virtual address space. - * - * RETURNS: - * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and - * returns the number of units to be allocated. -errno on failure. - */ -int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, - size_t *unit_sizep, size_t lpage_size, - int *unit_map, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn) -{ - static int group_map[NR_CPUS] __initdata; - static int group_cnt[NR_CPUS] __initdata; - const size_t static_size = __per_cpu_end - __per_cpu_start; - int group_cnt_max = 0; - size_t size_sum, min_unit_size, alloc_size; - int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ - int last_allocs; - unsigned int cpu, tcpu; - int group, unit; - - /* - * Determine min_unit_size, alloc_size and max_upa such that - * alloc_size is multiple of lpage_size and is the smallest - * which can accomodate 4k aligned segments which are equal to - * or larger than min_unit_size. - */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep); - min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - - alloc_size = roundup(min_unit_size, lpage_size); - upa = alloc_size / min_unit_size; - while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) - upa--; - max_upa = upa; - - /* group cpus according to their proximity */ - for_each_possible_cpu(cpu) { - group = 0; - next_group: - for_each_possible_cpu(tcpu) { - if (cpu == tcpu) - break; - if (group_map[tcpu] == group && - (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || - cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { - group++; - goto next_group; - } - } - group_map[cpu] = group; - group_cnt[group]++; - group_cnt_max = max(group_cnt_max, group_cnt[group]); - } - - /* - * Expand unit size until address space usage goes over 75% - * and then as much as possible without using more address - * space. - */ - last_allocs = INT_MAX; - for (upa = max_upa; upa; upa--) { - int allocs = 0, wasted = 0; - - if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) - continue; - - for (group = 0; group_cnt[group]; group++) { - int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); - allocs += this_allocs; - wasted += this_allocs * upa - group_cnt[group]; - } - - /* - * Don't accept if wastage is over 25%. The - * greater-than comparison ensures upa==1 always - * passes the following check. - */ - if (wasted > num_possible_cpus() / 3) - continue; - - /* and then don't consume more memory */ - if (allocs > last_allocs) - break; - last_allocs = allocs; - best_upa = upa; - } - *unit_sizep = alloc_size / best_upa; - - /* assign units to cpus accordingly */ - unit = 0; - for (group = 0; group_cnt[group]; group++) { - for_each_possible_cpu(cpu) - if (group_map[cpu] == group) - unit_map[cpu] = unit++; - unit = roundup(unit, best_upa); - } - - return unit; /* unit contains aligned number of units */ -} - struct pcpul_ent { void *ptr; void *map_addr; @@ -1778,43 +1820,6 @@ static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, return false; } -static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, - size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units) -{ - int width = 1, v = nr_units; - char empty_str[] = "--------"; - int upl, lpl; /* units per lpage, lpage per line */ - unsigned int cpu; - int lpage, unit; - - while (v /= 10) - width++; - empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0'; - - upl = max_t(int, lpage_size / unit_size, 1); - lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1)); - - printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl, - static_size, reserved_size, dyn_size, unit_size, lpage_size); - - for (lpage = 0, unit = 0; unit < nr_units; unit++) { - if (!(unit % upl)) { - if (!(lpage++ % lpl)) { - printk("\n"); - printk("%spcpu-lpage: ", lvl); - } else - printk("| "); - } - if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) - printk("%0*d ", width, cpu); - else - printk("%s ", empty_str); - } - printk("\n"); -} - /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page * @reserved_size: the size of reserved percpu area in bytes -- cgit v1.2.3-71-gd317 From fd1e8a1fe2b54df6c185b4fa65f181f50b9c4d4e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:51 +0900 Subject: percpu: introduce pcpu_alloc_info and pcpu_group_info Till now, non-linear cpu->unit map was expressed using an integer array which maps each cpu to a unit and used only by lpage allocator. Although how many units have been placed in a single contiguos area (group) is known while building unit_map, the information is lost when the result is recorded into the unit_map array. For lpage allocator, as all allocations are done by lpages and whether two adjacent lpages are in the same group or not is irrelevant, this didn't cause any problem. Non-linear cpu->unit mapping will be used for sparse embedding and this grouping information is necessary for that. This patch introduces pcpu_alloc_info which contains all the information necessary for initializing percpu allocator. pcpu_alloc_info contains array of pcpu_group_info which describes how units are grouped and mapped to cpus. pcpu_group_info also has base_offset field to specify its offset from the chunk's base address. pcpu_build_alloc_info() initializes this field as if all groups are allocated back-to-back as is currently done but this will be used to sparsely place groups. pcpu_alloc_info is a rather complex data structure which contains a flexible array which in turn points to nested cpu_map arrays. * pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to help dealing with pcpu_alloc_info. * pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info, generalized and renamed to pcpu_build_alloc_info(). @cpu_distance_fn may be NULL indicating that all cpus are of LOCAL_DISTANCE. * pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info, generalized and renamed to pcpu_dump_alloc_info(). It now also prints which group each alloc unit belongs to. * pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the separate parameters. All first chunk allocators are updated to use pcpu_build_alloc_info() to build alloc_info and call pcpu_setup_first_chunk() with it. This has the side effect of packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4. * x86 setup_pcpu_lpage() is updated to deal with alloc_info. * sparc64 setup_per_cpu_areas() is updated to build alloc_info. Although the changes made by this patch are pretty pervasive, it doesn't cause any behavior difference other than packing of sparse cpus. It mostly changes how information is passed among initialization functions and makes room for more flexibility. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: David Miller --- arch/sparc/kernel/smp_64.c | 24 +- arch/x86/kernel/setup_percpu.c | 38 ++- include/linux/percpu.h | 42 +++- mm/percpu.c | 529 +++++++++++++++++++++++++---------------- 4 files changed, 389 insertions(+), 244 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 9856d866b77b..a42a4a744d14 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1475,17 +1475,29 @@ static void __init pcpu_map_range(unsigned long start, unsigned long end, void __init setup_per_cpu_areas(void) { - size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start; static struct vm_struct vm; + struct pcpu_alloc_info *ai; unsigned long delta, cpu; size_t size_sum, pcpu_unit_size; size_t ptrs_size; void **ptrs; - size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + ai = pcpu_alloc_alloc_info(1, nr_cpu_ids); + + ai->static_size = __per_cpu_end - __per_cpu_start; + ai->reserved_size = PERCPU_MODULE_RESERVE; + + size_sum = PFN_ALIGN(ai->static_size + ai->reserved_size + PERCPU_DYNAMIC_RESERVE); - dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE; + ai->dyn_size = size_sum - ai->static_size - ai->reserved_size; + ai->unit_size = PCPU_CHUNK_SIZE; + ai->atom_size = PCPU_CHUNK_SIZE; + ai->alloc_size = PCPU_CHUNK_SIZE; + ai->groups[0].nr_units = nr_cpu_ids; + + for_each_possible_cpu(cpu) + ai->groups[0].cpu_map[cpu] = cpu; ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0])); ptrs = alloc_bootmem(ptrs_size); @@ -1497,7 +1509,7 @@ void __init setup_per_cpu_areas(void) free_bootmem(__pa(ptrs[cpu] + size_sum), PCPU_CHUNK_SIZE - size_sum); - memcpy(ptrs[cpu], __per_cpu_load, static_size); + memcpy(ptrs[cpu], __per_cpu_load, ai->static_size); } /* allocate address and map */ @@ -1514,9 +1526,7 @@ void __init setup_per_cpu_areas(void) pcpu_map_range(start, end, virt_to_page(ptrs[cpu])); } - pcpu_unit_size = pcpu_setup_first_chunk(static_size, - PERCPU_MODULE_RESERVE, dyn_size, - PCPU_CHUNK_SIZE, vm.addr, NULL); + pcpu_unit_size = pcpu_setup_first_chunk(ai, vm.addr); free_bootmem(__pa(ptrs), ptrs_size); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 660cde133141..db5f9c49fec5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -161,9 +161,7 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; - size_t unit_map_size, unit_size; - int *unit_map; - int nr_units; + struct pcpu_alloc_info *ai; ssize_t ret; /* on non-NUMA, embedding is better */ @@ -177,26 +175,22 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) } /* allocate and build unit_map */ - unit_map_size = nr_cpu_ids * sizeof(int); - unit_map = alloc_bootmem_nopanic(unit_map_size); - if (!unit_map) { - pr_warning("PERCPU: failed to allocate unit_map\n"); - return -ENOMEM; + ai = pcpu_build_alloc_info(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + PMD_SIZE, pcpu_lpage_cpu_distance); + if (IS_ERR(ai)) { + pr_warning("PERCPU: failed to build unit_map (%ld)\n", + PTR_ERR(ai)); + return PTR_ERR(ai); } - ret = pcpu_lpage_build_unit_map(PERCPU_FIRST_CHUNK_RESERVE, - &dyn_size, &unit_size, PMD_SIZE, - unit_map, pcpu_lpage_cpu_distance); - if (ret < 0) { - pr_warning("PERCPU: failed to build unit_map\n"); - goto out_free; - } - nr_units = ret; - /* do the parameters look okay? */ if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = nr_units * unit_size; + size_t tot_size = 0; + int group; + + for (group = 0; group < ai->nr_groups; group++) + tot_size += ai->unit_size * ai->groups[group].nr_units; /* don't consume more than 20% of vmalloc area */ if (tot_size > vm_size / 5) { @@ -207,12 +201,10 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) } } - ret = pcpu_lpage_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - unit_size, PMD_SIZE, unit_map, nr_units, - pcpu_fc_alloc, pcpu_fc_free, pcpul_map); + ret = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, + pcpul_map); out_free: - if (ret < 0) - free_bootmem(__pa(unit_map), unit_map_size); + pcpu_free_alloc_info(ai); return ret; } #else diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 570fb18de2ba..77b86be8ce4f 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -59,6 +59,25 @@ extern void *pcpu_base_addr; extern const int *pcpu_unit_map; +struct pcpu_group_info { + int nr_units; /* aligned # of units */ + unsigned long base_offset; /* base address offset */ + unsigned int *cpu_map; /* unit->cpu map, empty + * entries contain NR_CPUS */ +}; + +struct pcpu_alloc_info { + size_t static_size; + size_t reserved_size; + size_t dyn_size; + size_t unit_size; + size_t atom_size; + size_t alloc_size; + size_t __ai_size; /* internal, don't use */ + int nr_groups; /* 0 if grouping unnecessary */ + struct pcpu_group_info groups[]; +}; + enum pcpu_fc { PCPU_FC_AUTO, PCPU_FC_EMBED, @@ -78,18 +97,17 @@ typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern int __init pcpu_lpage_build_unit_map( - size_t reserved_size, ssize_t *dyn_sizep, - size_t *unit_sizep, size_t lpage_size, - int *unit_map, +extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, + int nr_units); +extern void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai); + +extern struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, ssize_t dyn_size, + size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn); -#endif -extern size_t __init pcpu_setup_first_chunk( - size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - void *base_addr, const int *unit_map); +extern size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK extern ssize_t __init pcpu_embed_first_chunk( @@ -106,9 +124,7 @@ extern ssize_t __init pcpu_page_first_chunk( #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK extern ssize_t __init pcpu_lpage_first_chunk( - size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units, + const struct pcpu_alloc_info *ai, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); diff --git a/mm/percpu.c b/mm/percpu.c index 2b9c4b2a2fc0..99f7fa682722 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -58,6 +58,7 @@ #include #include +#include #include #include #include @@ -1245,53 +1246,108 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size, return size_sum; } -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK /** - * pcpu_lpage_build_unit_map - build unit_map for large page remapping + * pcpu_alloc_alloc_info - allocate percpu allocation info + * @nr_groups: the number of groups + * @nr_units: the number of units + * + * Allocate ai which is large enough for @nr_groups groups containing + * @nr_units units. The returned ai's groups[0].cpu_map points to the + * cpu_map array which is long enough for @nr_units and filled with + * NR_CPUS. It's the caller's responsibility to initialize cpu_map + * pointer of other groups. + * + * RETURNS: + * Pointer to the allocated pcpu_alloc_info on success, NULL on + * failure. + */ +struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, + int nr_units) +{ + struct pcpu_alloc_info *ai; + size_t base_size, ai_size; + void *ptr; + int unit; + + base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]), + __alignof__(ai->groups[0].cpu_map[0])); + ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); + + ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); + if (!ptr) + return NULL; + ai = ptr; + ptr += base_size; + + ai->groups[0].cpu_map = ptr; + + for (unit = 0; unit < nr_units; unit++) + ai->groups[0].cpu_map[unit] = NR_CPUS; + + ai->nr_groups = nr_groups; + ai->__ai_size = PFN_ALIGN(ai_size); + + return ai; +} + +/** + * pcpu_free_alloc_info - free percpu allocation info + * @ai: pcpu_alloc_info to free + * + * Free @ai which was allocated by pcpu_alloc_alloc_info(). + */ +void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) +{ + free_bootmem(__pa(ai), ai->__ai_size); +} + +/** + * pcpu_build_alloc_info - build alloc_info considering distances between CPUs * @reserved_size: the size of reserved percpu area in bytes - * @dyn_sizep: in/out parameter for dynamic size, -1 for auto - * @unit_sizep: out parameter for unit size - * @unit_map: unit_map to be filled - * @cpu_distance_fn: callback to determine distance between cpus + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional * - * This function builds cpu -> unit map and determine other parameters - * considering needed percpu size, large page size and distances - * between CPUs in NUMA. + * This function determines grouping of units, their mappings to cpus + * and other parameters considering needed percpu size, allocation + * atom size and distances between CPUs. * - * CPUs which are of LOCAL_DISTANCE both ways are grouped together and - * may share units in the same large page. The returned configuration - * is guaranteed to have CPUs on different nodes on different large - * pages and >=75% usage of allocated virtual address space. + * Groups are always mutliples of atom size and CPUs which are of + * LOCAL_DISTANCE both ways are grouped together and share space for + * units in the same group. The returned configuration is guaranteed + * to have CPUs on different nodes on different groups and >=75% usage + * of allocated virtual address space. * * RETURNS: - * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and - * returns the number of units to be allocated. -errno on failure. + * On success, pointer to the new allocation_info is returned. On + * failure, ERR_PTR value is returned. */ -int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, - size_t *unit_sizep, size_t lpage_size, - int *unit_map, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; const size_t static_size = __per_cpu_end - __per_cpu_start; - int group_cnt_max = 0; + int group_cnt_max = 0, nr_groups = 1, nr_units = 0; size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ - int last_allocs; + int last_allocs, group, unit; unsigned int cpu, tcpu; - int group, unit; + struct pcpu_alloc_info *ai; + unsigned int *cpu_map; /* * Determine min_unit_size, alloc_size and max_upa such that - * alloc_size is multiple of lpage_size and is the smallest + * alloc_size is multiple of atom_size and is the smallest * which can accomodate 4k aligned segments which are equal to * or larger than min_unit_size. */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep); + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - alloc_size = roundup(min_unit_size, lpage_size); + alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) upa--; @@ -1304,10 +1360,11 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, for_each_possible_cpu(tcpu) { if (cpu == tcpu) break; - if (group_map[tcpu] == group && + if (group_map[tcpu] == group && cpu_distance_fn && (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { group++; + nr_groups = max(nr_groups, group + 1); goto next_group; } } @@ -1328,7 +1385,7 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) continue; - for (group = 0; group_cnt[group]; group++) { + for (group = 0; group < nr_groups; group++) { int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); allocs += this_allocs; wasted += this_allocs * upa - group_cnt[group]; @@ -1348,75 +1405,122 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, last_allocs = allocs; best_upa = upa; } - *unit_sizep = alloc_size / best_upa; + upa = best_upa; + + /* allocate and fill alloc_info */ + for (group = 0; group < nr_groups; group++) + nr_units += roundup(group_cnt[group], upa); + + ai = pcpu_alloc_alloc_info(nr_groups, nr_units); + if (!ai) + return ERR_PTR(-ENOMEM); + cpu_map = ai->groups[0].cpu_map; + + for (group = 0; group < nr_groups; group++) { + ai->groups[group].cpu_map = cpu_map; + cpu_map += roundup(group_cnt[group], upa); + } + + ai->static_size = static_size; + ai->reserved_size = reserved_size; + ai->dyn_size = dyn_size; + ai->unit_size = alloc_size / upa; + ai->atom_size = atom_size; + ai->alloc_size = alloc_size; + + for (group = 0, unit = 0; group_cnt[group]; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + + /* + * Initialize base_offset as if all groups are located + * back-to-back. The caller should update this to + * reflect actual allocation. + */ + gi->base_offset = unit * ai->unit_size; - /* assign units to cpus accordingly */ - unit = 0; - for (group = 0; group_cnt[group]; group++) { for_each_possible_cpu(cpu) if (group_map[cpu] == group) - unit_map[cpu] = unit++; - unit = roundup(unit, best_upa); + gi->cpu_map[gi->nr_units++] = cpu; + gi->nr_units = roundup(gi->nr_units, upa); + unit += gi->nr_units; } + BUG_ON(unit != nr_units); - return unit; /* unit contains aligned number of units */ + return ai; } -static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, - unsigned int *cpup); - -static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, - size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units) +/** + * pcpu_dump_alloc_info - print out information about pcpu_alloc_info + * @lvl: loglevel + * @ai: allocation info to dump + * + * Print out information about @ai using loglevel @lvl. + */ +static void pcpu_dump_alloc_info(const char *lvl, + const struct pcpu_alloc_info *ai) { - int width = 1, v = nr_units; + int group_width = 1, cpu_width = 1, width; char empty_str[] = "--------"; - int upl, lpl; /* units per lpage, lpage per line */ - unsigned int cpu; - int lpage, unit; + int alloc = 0, alloc_end = 0; + int group, v; + int upa, apl; /* units per alloc, allocs per line */ + + v = ai->nr_groups; + while (v /= 10) + group_width++; + v = num_possible_cpus(); while (v /= 10) - width++; - empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0'; + cpu_width++; + empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; - upl = max_t(int, lpage_size / unit_size, 1); - lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1)); + upa = ai->alloc_size / ai->unit_size; + width = upa * (cpu_width + 1) + group_width + 3; + apl = rounddown_pow_of_two(max(60 / width, 1)); - printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl, - static_size, reserved_size, dyn_size, unit_size, lpage_size); + printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", + lvl, ai->static_size, ai->reserved_size, ai->dyn_size, + ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); - for (lpage = 0, unit = 0; unit < nr_units; unit++) { - if (!(unit % upl)) { - if (!(lpage++ % lpl)) { + for (group = 0; group < ai->nr_groups; group++) { + const struct pcpu_group_info *gi = &ai->groups[group]; + int unit = 0, unit_end = 0; + + BUG_ON(gi->nr_units % upa); + for (alloc_end += gi->nr_units / upa; + alloc < alloc_end; alloc++) { + if (!(alloc % apl)) { printk("\n"); - printk("%spcpu-lpage: ", lvl); - } else - printk("| "); + printk("%spcpu-alloc: ", lvl); + } + printk("[%0*d] ", group_width, group); + + for (unit_end += upa; unit < unit_end; unit++) + if (gi->cpu_map[unit] != NR_CPUS) + printk("%0*d ", cpu_width, + gi->cpu_map[unit]); + else + printk("%s ", empty_str); } - if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) - printk("%0*d ", width, cpu); - else - printk("%s ", empty_str); } printk("\n"); } -#endif /** * pcpu_setup_first_chunk - initialize the first percpu chunk - * @static_size: the size of static percpu area in bytes - * @reserved_size: the size of reserved percpu area in bytes, 0 for none - * @dyn_size: free size for dynamic allocation in bytes - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE + * @ai: pcpu_alloc_info describing how to percpu area is shaped * @base_addr: mapped address - * @unit_map: cpu -> unit map, NULL for sequential mapping * * Initialize the first percpu chunk which contains the kernel static * perpcu area. This function is to be called from arch percpu area * setup path. * - * @reserved_size, if non-zero, specifies the amount of bytes to + * @ai contains all information necessary to initialize the first + * chunk and prime the dynamic percpu allocator. + * + * @ai->static_size is the size of static percpu area. + * + * @ai->reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved * percpu allocation. This is primarily used to serve module percpu @@ -1424,13 +1528,26 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * - * @dyn_size determines the number of bytes available for dynamic - * allocation in the first chunk. The area between @static_size + - * @reserved_size + @dyn_size and @unit_size is unused. + * @ai->dyn_size determines the number of bytes available for dynamic + * allocation in the first chunk. The area between @ai->static_size + + * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. * - * @unit_size specifies unit size and must be aligned to PAGE_SIZE and - * equal to or larger than @static_size + @reserved_size + if - * non-negative, @dyn_size. + * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE + * and equal to or larger than @ai->static_size + @ai->reserved_size + + * @ai->dyn_size. + * + * @ai->atom_size is the allocation atom size and used as alignment + * for vm areas. + * + * @ai->alloc_size is the allocation size and always multiple of + * @ai->atom_size. This is larger than @ai->atom_size if + * @ai->unit_size is larger than @ai->atom_size. + * + * @ai->nr_groups and @ai->groups describe virtual memory layout of + * percpu areas. Units which should be colocated are put into the + * same group. Dynamic VM areas will be allocated according to these + * groupings. If @ai->nr_groups is zero, a single group containing + * all units is assumed. * * The caller should have mapped the first chunk at @base_addr and * copied static data to each unit. @@ -1446,70 +1563,63 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, * The determined pcpu_unit_size which can be used to initialize * percpu access. */ -size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - void *base_addr, const int *unit_map) +size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) { static struct vm_struct first_vm; static int smap[2], dmap[2]; - size_t size_sum = static_size + reserved_size + dyn_size; + size_t dyn_size = ai->dyn_size; + size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; - unsigned int cpu, tcpu; - int i; + unsigned int cpu; + int *unit_map; + int group, unit, i; /* sanity checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); - BUG_ON(!static_size); + BUG_ON(ai->nr_groups <= 0); + BUG_ON(!ai->static_size); BUG_ON(!base_addr); - BUG_ON(unit_size < size_sum); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + BUG_ON(ai->unit_size < size_sum); + BUG_ON(ai->unit_size & ~PAGE_MASK); + BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); + + pcpu_dump_alloc_info(KERN_DEBUG, ai); /* determine number of units and verify and initialize pcpu_unit_map */ - if (unit_map) { - int first_unit = INT_MAX, last_unit = INT_MIN; - - for_each_possible_cpu(cpu) { - int unit = unit_map[cpu]; - - BUG_ON(unit < 0); - for_each_possible_cpu(tcpu) { - if (tcpu == cpu) - break; - /* the mapping should be one-to-one */ - BUG_ON(unit_map[tcpu] == unit); - } + unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); - if (unit < first_unit) { - pcpu_first_unit_cpu = cpu; - first_unit = unit; - } - if (unit > last_unit) { - pcpu_last_unit_cpu = cpu; - last_unit = unit; - } - } - pcpu_nr_units = last_unit + 1; - pcpu_unit_map = unit_map; - } else { - int *identity_map; + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + unit_map[cpu] = NR_CPUS; + pcpu_first_unit_cpu = NR_CPUS; - /* #units == #cpus, identity mapped */ - identity_map = alloc_bootmem(nr_cpu_ids * - sizeof(identity_map[0])); + for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { + const struct pcpu_group_info *gi = &ai->groups[group]; - for_each_possible_cpu(cpu) - identity_map[cpu] = cpu; + for (i = 0; i < gi->nr_units; i++) { + cpu = gi->cpu_map[i]; + if (cpu == NR_CPUS) + continue; - pcpu_first_unit_cpu = 0; - pcpu_last_unit_cpu = pcpu_nr_units - 1; - pcpu_nr_units = nr_cpu_ids; - pcpu_unit_map = identity_map; + BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu)); + BUG_ON(unit_map[cpu] != NR_CPUS); + + unit_map[cpu] = unit + i; + if (pcpu_first_unit_cpu == NR_CPUS) + pcpu_first_unit_cpu = cpu; + } } + pcpu_last_unit_cpu = cpu; + pcpu_nr_units = unit; + + for_each_possible_cpu(cpu) + BUG_ON(unit_map[cpu] == NR_CPUS); + + pcpu_unit_map = unit_map; /* determine basic parameters */ - pcpu_unit_pages = unit_size >> PAGE_SHIFT; + pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + @@ -1543,17 +1653,17 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, schunk->immutable = true; bitmap_fill(schunk->populated, pcpu_unit_pages); - if (reserved_size) { - schunk->free_size = reserved_size; + if (ai->reserved_size) { + schunk->free_size = ai->reserved_size; pcpu_reserved_chunk = schunk; - pcpu_reserved_chunk_limit = static_size + reserved_size; + pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; } else { schunk->free_size = dyn_size; dyn_size = 0; /* dynamic area covered */ } schunk->contig_hint = schunk->free_size; - schunk->map[schunk->map_used++] = -static_size; + schunk->map[schunk->map_used++] = -ai->static_size; if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; @@ -1643,44 +1753,47 @@ early_param("percpu_alloc", percpu_alloc_setup); */ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) { - const size_t static_size = __per_cpu_end - __per_cpu_start; - size_t size_sum, unit_size, chunk_size; + struct pcpu_alloc_info *ai; + size_t size_sum, chunk_size; void *base; - unsigned int cpu; + int unit; + ssize_t ret; - /* determine parameters and allocate */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL); + if (IS_ERR(ai)) + return PTR_ERR(ai); + BUG_ON(ai->nr_groups != 1); + BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); - unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - chunk_size = unit_size * nr_cpu_ids; + size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; + chunk_size = ai->unit_size * num_possible_cpus(); base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!base) { pr_warning("PERCPU: failed to allocate %zu bytes for " "embedding\n", chunk_size); - return -ENOMEM; + ret = -ENOMEM; + goto out_free_ai; } /* return the leftover and copy */ - for (cpu = 0; cpu < nr_cpu_ids; cpu++) { - void *ptr = base + cpu * unit_size; - - if (cpu_possible(cpu)) { - free_bootmem(__pa(ptr + size_sum), - unit_size - size_sum); - memcpy(ptr, __per_cpu_load, static_size); - } else - free_bootmem(__pa(ptr), unit_size); + for (unit = 0; unit < num_possible_cpus(); unit++) { + void *ptr = base + unit * ai->unit_size; + + free_bootmem(__pa(ptr + size_sum), ai->unit_size - size_sum); + memcpy(ptr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", - PFN_DOWN(size_sum), base, static_size, reserved_size, dyn_size, - unit_size); + PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, + ai->dyn_size, ai->unit_size); - return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, base, NULL); + ret = pcpu_setup_first_chunk(ai, base); +out_free_ai: + pcpu_free_alloc_info(ai); + return ret; } #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || !CONFIG_HAVE_SETUP_PER_CPU_AREA */ @@ -1709,31 +1822,34 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; - const size_t static_size = __per_cpu_end - __per_cpu_start; - ssize_t dyn_size = -1; - size_t size_sum, unit_size; + struct pcpu_alloc_info *ai; char psize_str[16]; int unit_pages; size_t pages_size; struct page **pages; - unsigned int cpu; - int i, j; + int unit, i, j; ssize_t ret; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); - unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - unit_pages = unit_size >> PAGE_SHIFT; + ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL); + if (IS_ERR(ai)) + return PTR_ERR(ai); + BUG_ON(ai->nr_groups != 1); + BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); + + unit_pages = ai->unit_size >> PAGE_SHIFT; /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0])); + pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * + sizeof(pages[0])); pages = alloc_bootmem(pages_size); /* allocate pages */ j = 0; - for_each_possible_cpu(cpu) + for (unit = 0; unit < num_possible_cpus(); unit++) for (i = 0; i < unit_pages; i++) { + unsigned int cpu = ai->groups[0].cpu_map[unit]; void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); @@ -1747,18 +1863,18 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; - vm.size = nr_cpu_ids * unit_size; + vm.size = num_possible_cpus() * ai->unit_size; vm_area_register_early(&vm, PAGE_SIZE); - for_each_possible_cpu(cpu) { + for (unit = 0; unit < num_possible_cpus(); unit++) { unsigned long unit_addr = - (unsigned long)vm.addr + cpu * unit_size; + (unsigned long)vm.addr + unit * ai->unit_size; for (i = 0; i < unit_pages; i++) populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ - ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages], + ret = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], unit_pages); if (ret < 0) panic("failed to map percpu area, err=%zd\n", ret); @@ -1772,16 +1888,15 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, */ /* copy static data */ - memcpy((void *)unit_addr, __per_cpu_load, static_size); + memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", - unit_pages, psize_str, vm.addr, static_size, reserved_size, - dyn_size); + unit_pages, psize_str, vm.addr, ai->static_size, + ai->reserved_size, ai->dyn_size); - ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, vm.addr, NULL); + ret = pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: @@ -1790,6 +1905,7 @@ enomem: ret = -ENOMEM; out_free_ar: free_bootmem(__pa(pages), pages_size); + pcpu_free_alloc_info(ai); return ret; } #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ @@ -1805,38 +1921,50 @@ static size_t pcpul_lpage_size; static int pcpul_nr_lpages; static struct pcpul_ent *pcpul_map; -static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, +static bool __init pcpul_unit_to_cpu(int unit, const struct pcpu_alloc_info *ai, unsigned int *cpup) { - unsigned int cpu; + int group, cunit; - for_each_possible_cpu(cpu) - if (unit_map[cpu] == unit) { + for (group = 0, cunit = 0; group < ai->nr_groups; group++) { + const struct pcpu_group_info *gi = &ai->groups[group]; + + if (unit < cunit + gi->nr_units) { if (cpup) - *cpup = cpu; + *cpup = gi->cpu_map[unit - cunit]; return true; } + cunit += gi->nr_units; + } return false; } +static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai) +{ + int group, unit, i; + + for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { + const struct pcpu_group_info *gi = &ai->groups[group]; + + for (i = 0; i < gi->nr_units; i++) + if (gi->cpu_map[i] == cpu) + return unit + i; + } + BUG(); +} + /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page - * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes - * @unit_size: unit size in bytes - * @lpage_size: the size of a large page - * @unit_map: cpu -> unit mapping - * @nr_units: the number of units + * @ai: pcpu_alloc_info * @alloc_fn: function to allocate percpu lpage, always called with lpage_size * @free_fn: function to free percpu memory, @size <= lpage_size * @map_fn: function to map percpu lpage, always called with lpage_size * * This allocator uses large page to build and map the first chunk. - * Unlike other helpers, the caller should always specify @dyn_size - * and @unit_size. These parameters along with @unit_map and - * @nr_units can be determined using pcpu_lpage_build_unit_map(). - * This two stage initialization is to allow arch code to evaluate the + * Unlike other helpers, the caller should provide fully initialized + * @ai. This can be done using pcpu_build_alloc_info(). This two + * stage initialization is to allow arch code to evaluate the * parameters before committing to it. * * Large pages are allocated as directed by @unit_map and other @@ -1852,27 +1980,26 @@ static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units, +ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { static struct vm_struct vm; - const size_t static_size = __per_cpu_end - __per_cpu_start; - size_t chunk_size = unit_size * nr_units; - size_t map_size; + const size_t lpage_size = ai->atom_size; + size_t chunk_size, map_size; unsigned int cpu; ssize_t ret; - int i, j, unit; + int i, j, unit, nr_units; - pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size, - unit_size, lpage_size, unit_map, nr_units); + nr_units = 0; + for (i = 0; i < ai->nr_groups; i++) + nr_units += ai->groups[i].nr_units; + chunk_size = ai->unit_size * nr_units; BUG_ON(chunk_size % lpage_size); - pcpul_size = static_size + reserved_size + dyn_size; + pcpul_size = ai->static_size + ai->reserved_size + ai->dyn_size; pcpul_lpage_size = lpage_size; pcpul_nr_lpages = chunk_size / lpage_size; @@ -1883,13 +2010,13 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, /* allocate all pages */ for (i = 0; i < pcpul_nr_lpages; i++) { size_t offset = i * lpage_size; - int first_unit = offset / unit_size; - int last_unit = (offset + lpage_size - 1) / unit_size; + int first_unit = offset / ai->unit_size; + int last_unit = (offset + lpage_size - 1) / ai->unit_size; void *ptr; /* find out which cpu is mapped to this unit */ for (unit = first_unit; unit <= last_unit; unit++) - if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + if (pcpul_unit_to_cpu(unit, ai, &cpu)) goto found; continue; found: @@ -1905,12 +2032,12 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, /* return unused holes */ for (unit = 0; unit < nr_units; unit++) { - size_t start = unit * unit_size; - size_t end = start + unit_size; + size_t start = unit * ai->unit_size; + size_t end = start + ai->unit_size; size_t off, next; /* don't free used part of occupied unit */ - if (pcpul_unit_to_cpu(unit, unit_map, NULL)) + if (pcpul_unit_to_cpu(unit, ai, NULL)) start += pcpul_size; /* unit can span more than one page, punch the holes */ @@ -1925,7 +2052,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, /* allocate address, map and copy */ vm.flags = VM_ALLOC; vm.size = chunk_size; - vm_area_register_early(&vm, unit_size); + vm_area_register_early(&vm, ai->unit_size); for (i = 0; i < pcpul_nr_lpages; i++) { if (!pcpul_map[i].ptr) @@ -1935,15 +2062,15 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, } for_each_possible_cpu(cpu) - memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load, - static_size); + memcpy(vm.addr + pcpul_cpu_to_unit(cpu, ai) * ai->unit_size, + __per_cpu_load, ai->static_size); /* we're ready, commit */ pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n", - vm.addr, static_size, reserved_size, dyn_size, unit_size); + vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size, + ai->unit_size); - ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, vm.addr, unit_map); + ret = pcpu_setup_first_chunk(ai, vm.addr); /* * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped -- cgit v1.2.3-71-gd317 From fb435d5233f8b6f9b93c11d6304d8e98fed03234 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:51 +0900 Subject: percpu: add pcpu_unit_offsets[] Currently units are mapped sequentially into address space. This patch adds pcpu_unit_offsets[] which allows units to be mapped to arbitrary offsets from the chunk base address. This is necessary to allow sparse embedding which might would need to allocate address ranges and memory areas which aren't aligned to unit size but allocation atom size (page or large page size). This also simplifies things a bit by removing the need to calculate offset from unit number. With this change, there's no need for the arch code to know pcpu_unit_size. Update pcpu_setup_first_chunk() and first chunk allocators to return regular 0 or -errno return code instead of unit size or -errno. Signed-off-by: Tejun Heo Cc: David S. Miller --- arch/sparc/kernel/smp_64.c | 12 +++--- arch/x86/kernel/setup_percpu.c | 51 ++++++++++------------- include/linux/percpu.h | 16 ++++--- mm/percpu.c | 95 +++++++++++++++++++++--------------------- 4 files changed, 84 insertions(+), 90 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index a42a4a744d14..b03fd362c629 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1478,9 +1478,10 @@ void __init setup_per_cpu_areas(void) static struct vm_struct vm; struct pcpu_alloc_info *ai; unsigned long delta, cpu; - size_t size_sum, pcpu_unit_size; + size_t size_sum; size_t ptrs_size; void **ptrs; + int rc; ai = pcpu_alloc_alloc_info(1, nr_cpu_ids); @@ -1526,14 +1527,15 @@ void __init setup_per_cpu_areas(void) pcpu_map_range(start, end, virt_to_page(ptrs[cpu])); } - pcpu_unit_size = pcpu_setup_first_chunk(ai, vm.addr); + rc = pcpu_setup_first_chunk(ai, vm.addr); + if (rc) + panic("failed to setup percpu first chunk (%d)", rc); free_bootmem(__pa(ptrs), ptrs_size); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; - for_each_possible_cpu(cpu) { - __per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; - } + for_each_possible_cpu(cpu) + __per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; /* Setup %g5 for the boot cpu. */ __local_per_cpu_offset = __per_cpu_offset(smp_processor_id()); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index db5f9c49fec5..9becc5d4b518 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -157,12 +157,12 @@ static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } -static ssize_t __init setup_pcpu_lpage(bool chosen) +static int __init setup_pcpu_lpage(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; struct pcpu_alloc_info *ai; - ssize_t ret; + int rc; /* on non-NUMA, embedding is better */ if (!chosen && !pcpu_need_numa()) @@ -196,19 +196,18 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) if (tot_size > vm_size / 5) { pr_info("PERCPU: too large chunk size %zuMB for " "large page remap\n", tot_size >> 20); - ret = -EINVAL; + rc = -EINVAL; goto out_free; } } - ret = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, - pcpul_map); + rc = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, pcpul_map); out_free: pcpu_free_alloc_info(ai); - return ret; + return rc; } #else -static ssize_t __init setup_pcpu_lpage(bool chosen) +static int __init setup_pcpu_lpage(bool chosen) { return -EINVAL; } @@ -222,7 +221,7 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) * mapping so that it can use PMD mapping without additional TLB * pressure. */ -static ssize_t __init setup_pcpu_embed(bool chosen) +static int __init setup_pcpu_embed(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -250,7 +249,7 @@ static void __init pcpup_populate_pte(unsigned long addr) populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_page(void) +static int __init setup_pcpu_page(void) { return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, pcpu_fc_alloc, pcpu_fc_free, @@ -274,8 +273,7 @@ void __init setup_per_cpu_areas(void) { unsigned int cpu; unsigned long delta; - size_t pcpu_unit_size; - ssize_t ret; + int rc; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); @@ -285,36 +283,33 @@ void __init setup_per_cpu_areas(void) * of large page mappings. Please read comments on top of * each allocator for details. */ - ret = -EINVAL; + rc = -EINVAL; if (pcpu_chosen_fc != PCPU_FC_AUTO) { if (pcpu_chosen_fc != PCPU_FC_PAGE) { if (pcpu_chosen_fc == PCPU_FC_LPAGE) - ret = setup_pcpu_lpage(true); + rc = setup_pcpu_lpage(true); else - ret = setup_pcpu_embed(true); + rc = setup_pcpu_embed(true); - if (ret < 0) - pr_warning("PERCPU: %s allocator failed (%zd), " + if (rc < 0) + pr_warning("PERCPU: %s allocator failed (%d), " "falling back to page size\n", - pcpu_fc_names[pcpu_chosen_fc], ret); + pcpu_fc_names[pcpu_chosen_fc], rc); } } else { - ret = setup_pcpu_lpage(false); - if (ret < 0) - ret = setup_pcpu_embed(false); + rc = setup_pcpu_lpage(false); + if (rc < 0) + rc = setup_pcpu_embed(false); } - if (ret < 0) - ret = setup_pcpu_page(); - if (ret < 0) - panic("cannot initialize percpu area (err=%zd)", ret); - - pcpu_unit_size = ret; + if (rc < 0) + rc = setup_pcpu_page(); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = - delta + pcpu_unit_map[cpu] * pcpu_unit_size; + per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 77b86be8ce4f..a7ec840f596c 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -57,7 +57,7 @@ #endif extern void *pcpu_base_addr; -extern const int *pcpu_unit_map; +extern const unsigned long *pcpu_unit_offsets; struct pcpu_group_info { int nr_units; /* aligned # of units */ @@ -106,25 +106,23 @@ extern struct pcpu_alloc_info * __init pcpu_build_alloc_info( size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn); -extern size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, - void *base_addr); +extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK -extern ssize_t __init pcpu_embed_first_chunk( - size_t reserved_size, ssize_t dyn_size); +extern int __init pcpu_embed_first_chunk(size_t reserved_size, + ssize_t dyn_size); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK -extern ssize_t __init pcpu_page_first_chunk( - size_t reserved_size, +extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern ssize_t __init pcpu_lpage_first_chunk( - const struct pcpu_alloc_info *ai, +extern int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); diff --git a/mm/percpu.c b/mm/percpu.c index 99f7fa682722..653b02c40200 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -117,8 +117,8 @@ static unsigned int pcpu_last_unit_cpu __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); -/* cpu -> unit map */ -const int *pcpu_unit_map __read_mostly; +static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */ +const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */ /* * The first chunk which always exists. Note that unlike other @@ -196,8 +196,8 @@ static int pcpu_page_idx(unsigned int cpu, int page_idx) static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { - return (unsigned long)chunk->vm->addr + - (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); + return (unsigned long)chunk->vm->addr + pcpu_unit_offsets[cpu] + + (page_idx << PAGE_SHIFT); } static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, @@ -341,7 +341,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * space. Note that any possible cpu id can be used here, so * there's no need to worry about preemption or cpu hotplug. */ - addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size; + addr += pcpu_unit_offsets[smp_processor_id()]; return pcpu_get_page_chunk(vmalloc_to_page(addr)); } @@ -1560,17 +1560,17 @@ static void pcpu_dump_alloc_info(const char *lvl, * and available for dynamic allocation like any other chunks. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access. + * 0 on success, -errno on failure. */ -size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, - void *base_addr) +int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) { static struct vm_struct first_vm; static int smap[2], dmap[2]; size_t dyn_size = ai->dyn_size; size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; + unsigned long *unit_off; unsigned int cpu; int *unit_map; int group, unit, i; @@ -1587,8 +1587,9 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_dump_alloc_info(KERN_DEBUG, ai); - /* determine number of units and verify and initialize pcpu_unit_map */ + /* determine number of units and initialize unit_map and base */ unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); + unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = NR_CPUS; @@ -1606,6 +1607,8 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, BUG_ON(unit_map[cpu] != NR_CPUS); unit_map[cpu] = unit + i; + unit_off[cpu] = gi->base_offset + i * ai->unit_size; + if (pcpu_first_unit_cpu == NR_CPUS) pcpu_first_unit_cpu = cpu; } @@ -1617,6 +1620,7 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, BUG_ON(unit_map[cpu] == NR_CPUS); pcpu_unit_map = unit_map; + pcpu_unit_offsets = unit_off; /* determine basic parameters */ pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; @@ -1688,7 +1692,7 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* we're done */ pcpu_base_addr = schunk->vm->addr; - return pcpu_unit_size; + return 0; } const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { @@ -1748,16 +1752,15 @@ early_param("percpu_alloc", percpu_alloc_setup); * size, the leftover is returned to the bootmem allocator. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. */ -ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) +int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) { struct pcpu_alloc_info *ai; size_t size_sum, chunk_size; void *base; int unit; - ssize_t ret; + int rc; ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL); if (IS_ERR(ai)) @@ -1773,7 +1776,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) if (!base) { pr_warning("PERCPU: failed to allocate %zu bytes for " "embedding\n", chunk_size); - ret = -ENOMEM; + rc = -ENOMEM; goto out_free_ai; } @@ -1790,10 +1793,10 @@ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); - ret = pcpu_setup_first_chunk(ai, base); + rc = pcpu_setup_first_chunk(ai, base); out_free_ai: pcpu_free_alloc_info(ai); - return ret; + return rc; } #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || !CONFIG_HAVE_SETUP_PER_CPU_AREA */ @@ -1813,13 +1816,12 @@ out_free_ai: * page-by-page into vmalloc area. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. */ -ssize_t __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn) +int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; struct pcpu_alloc_info *ai; @@ -1827,8 +1829,7 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, int unit_pages; size_t pages_size; struct page **pages; - int unit, i, j; - ssize_t ret; + int unit, i, j, rc; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); @@ -1874,10 +1875,10 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ - ret = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], - unit_pages); - if (ret < 0) - panic("failed to map percpu area, err=%zd\n", ret); + rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], + unit_pages); + if (rc < 0) + panic("failed to map percpu area, err=%d\n", rc); /* * FIXME: Archs with virtual cache should flush local @@ -1896,17 +1897,17 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, unit_pages, psize_str, vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size); - ret = pcpu_setup_first_chunk(ai, vm.addr); + rc = pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: while (--j >= 0) free_fn(page_address(pages[j]), PAGE_SIZE); - ret = -ENOMEM; + rc = -ENOMEM; out_free_ar: free_bootmem(__pa(pages), pages_size); pcpu_free_alloc_info(ai); - return ret; + return rc; } #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ @@ -1977,20 +1978,18 @@ static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai) * pcpu_lpage_remapped(). * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. */ -ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn) +int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) { static struct vm_struct vm; const size_t lpage_size = ai->atom_size; size_t chunk_size, map_size; unsigned int cpu; - ssize_t ret; - int i, j, unit, nr_units; + int i, j, unit, nr_units, rc; nr_units = 0; for (i = 0; i < ai->nr_groups; i++) @@ -2070,7 +2069,7 @@ ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); - ret = pcpu_setup_first_chunk(ai, vm.addr); + rc = pcpu_setup_first_chunk(ai, vm.addr); /* * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped @@ -2094,7 +2093,7 @@ ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) pcpul_nr_lpages--; - return ret; + return rc; enomem: for (i = 0; i < pcpul_nr_lpages; i++) @@ -2166,21 +2165,21 @@ EXPORT_SYMBOL(__per_cpu_offset); void __init setup_per_cpu_areas(void) { - ssize_t unit_size; unsigned long delta; unsigned int cpu; + int rc; /* * Always reserve area for module percpu variables. That's * what the legacy allocator did. */ - unit_size = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE); - if (unit_size < 0) + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE); + if (rc < 0) panic("Failed to initialized percpu areas."); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) - __per_cpu_offset[cpu] = delta + cpu * unit_size; + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ -- cgit v1.2.3-71-gd317 From ca23e405e06d5fffb005df004c72781f76062f51 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:52 +0900 Subject: vmalloc: implement pcpu_get_vm_areas() To directly use spread NUMA memories for percpu units, percpu allocator will be updated to allow sparsely mapping units in a chunk. As the distances between units can be very large, this makes allocating single vmap area for each chunk undesirable. This patch implements pcpu_get_vm_areas() and pcpu_free_vm_areas() which allocates and frees sparse congruent vmap areas. pcpu_get_vm_areas() take @offsets and @sizes array which define distances and sizes of vmap areas. It scans down from the top of vmalloc area looking for the top-most address which can accomodate all the areas. The top-down scan is to avoid interacting with regular vmallocs which can push up these congruent areas up little by little ending up wasting address space and page table. To speed up top-down scan, the highest possible address hint is maintained. Although the scan is linear from the hint, given the usual large holes between memory addresses between NUMA nodes, the scanning is highly likely to finish after finding the first hole for the last unit which is scanned first. Signed-off-by: Tejun Heo Cc: Nick Piggin --- include/linux/vmalloc.h | 6 + mm/vmalloc.c | 293 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 299 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index a43ebec3a7b9..227c2a585e4f 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -115,4 +115,10 @@ extern rwlock_t vmlist_lock; extern struct vm_struct *vmlist; extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); +struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, + const size_t *sizes, int nr_vms, + size_t align, gfp_t gfp_mask); + +void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); + #endif /* _LINUX_VMALLOC_H */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2eb461c3a46e..204b8243d8ab 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -265,6 +265,7 @@ struct vmap_area { static DEFINE_SPINLOCK(vmap_area_lock); static struct rb_root vmap_area_root = RB_ROOT; static LIST_HEAD(vmap_area_list); +static unsigned long vmap_area_pcpu_hole; static struct vmap_area *__find_vmap_area(unsigned long addr) { @@ -431,6 +432,15 @@ static void __free_vmap_area(struct vmap_area *va) RB_CLEAR_NODE(&va->rb_node); list_del_rcu(&va->list); + /* + * Track the highest possible candidate for pcpu area + * allocation. Areas outside of vmalloc area can be returned + * here too, consider only end addresses which fall inside + * vmalloc area proper. + */ + if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) + vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); + call_rcu(&va->rcu_head, rcu_free_va); } @@ -1038,6 +1048,9 @@ void __init vmalloc_init(void) va->va_end = va->va_start + tmp->size; __insert_vmap_area(va); } + + vmap_area_pcpu_hole = VMALLOC_END; + vmap_initialized = true; } @@ -1821,6 +1834,286 @@ void free_vm_area(struct vm_struct *area) } EXPORT_SYMBOL_GPL(free_vm_area); +static struct vmap_area *node_to_va(struct rb_node *n) +{ + return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; +} + +/** + * pvm_find_next_prev - find the next and prev vmap_area surrounding @end + * @end: target address + * @pnext: out arg for the next vmap_area + * @pprev: out arg for the previous vmap_area + * + * Returns: %true if either or both of next and prev are found, + * %false if no vmap_area exists + * + * Find vmap_areas end addresses of which enclose @end. ie. if not + * NULL, *pnext->va_end > @end and *pprev->va_end <= @end. + */ +static bool pvm_find_next_prev(unsigned long end, + struct vmap_area **pnext, + struct vmap_area **pprev) +{ + struct rb_node *n = vmap_area_root.rb_node; + struct vmap_area *va = NULL; + + while (n) { + va = rb_entry(n, struct vmap_area, rb_node); + if (end < va->va_end) + n = n->rb_left; + else if (end > va->va_end) + n = n->rb_right; + else + break; + } + + if (!va) + return false; + + if (va->va_end > end) { + *pnext = va; + *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); + } else { + *pprev = va; + *pnext = node_to_va(rb_next(&(*pprev)->rb_node)); + } + return true; +} + +/** + * pvm_determine_end - find the highest aligned address between two vmap_areas + * @pnext: in/out arg for the next vmap_area + * @pprev: in/out arg for the previous vmap_area + * @align: alignment + * + * Returns: determined end address + * + * Find the highest aligned address between *@pnext and *@pprev below + * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned + * down address is between the end addresses of the two vmap_areas. + * + * Please note that the address returned by this function may fall + * inside *@pnext vmap_area. The caller is responsible for checking + * that. + */ +static unsigned long pvm_determine_end(struct vmap_area **pnext, + struct vmap_area **pprev, + unsigned long align) +{ + const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); + unsigned long addr; + + if (*pnext) + addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end); + else + addr = vmalloc_end; + + while (*pprev && (*pprev)->va_end > addr) { + *pnext = *pprev; + *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); + } + + return addr; +} + +/** + * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator + * @offsets: array containing offset of each area + * @sizes: array containing size of each area + * @nr_vms: the number of areas to allocate + * @align: alignment, all entries in @offsets and @sizes must be aligned to this + * @gfp_mask: allocation mask + * + * Returns: kmalloc'd vm_struct pointer array pointing to allocated + * vm_structs on success, %NULL on failure + * + * Percpu allocator wants to use congruent vm areas so that it can + * maintain the offsets among percpu areas. This function allocates + * congruent vmalloc areas for it. These areas tend to be scattered + * pretty far, distance between two areas easily going up to + * gigabytes. To avoid interacting with regular vmallocs, these areas + * are allocated from top. + * + * Despite its complicated look, this allocator is rather simple. It + * does everything top-down and scans areas from the end looking for + * matching slot. While scanning, if any of the areas overlaps with + * existing vmap_area, the base address is pulled down to fit the + * area. Scanning is repeated till all the areas fit and then all + * necessary data structres are inserted and the result is returned. + */ +struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, + const size_t *sizes, int nr_vms, + size_t align, gfp_t gfp_mask) +{ + const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); + const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); + struct vmap_area **vas, *prev, *next; + struct vm_struct **vms; + int area, area2, last_area, term_area; + unsigned long base, start, end, last_end; + bool purged = false; + + gfp_mask &= GFP_RECLAIM_MASK; + + /* verify parameters and allocate data structures */ + BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); + for (last_area = 0, area = 0; area < nr_vms; area++) { + start = offsets[area]; + end = start + sizes[area]; + + /* is everything aligned properly? */ + BUG_ON(!IS_ALIGNED(offsets[area], align)); + BUG_ON(!IS_ALIGNED(sizes[area], align)); + + /* detect the area with the highest address */ + if (start > offsets[last_area]) + last_area = area; + + for (area2 = 0; area2 < nr_vms; area2++) { + unsigned long start2 = offsets[area2]; + unsigned long end2 = start2 + sizes[area2]; + + if (area2 == area) + continue; + + BUG_ON(start2 >= start && start2 < end); + BUG_ON(end2 <= end && end2 > start); + } + } + last_end = offsets[last_area] + sizes[last_area]; + + if (vmalloc_end - vmalloc_start < last_end) { + WARN_ON(true); + return NULL; + } + + vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); + vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); + if (!vas || !vms) + goto err_free; + + for (area = 0; area < nr_vms; area++) { + vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); + vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); + if (!vas[area] || !vms[area]) + goto err_free; + } +retry: + spin_lock(&vmap_area_lock); + + /* start scanning - we scan from the top, begin with the last area */ + area = term_area = last_area; + start = offsets[area]; + end = start + sizes[area]; + + if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) { + base = vmalloc_end - last_end; + goto found; + } + base = pvm_determine_end(&next, &prev, align) - end; + + while (true) { + BUG_ON(next && next->va_end <= base + end); + BUG_ON(prev && prev->va_end > base + end); + + /* + * base might have underflowed, add last_end before + * comparing. + */ + if (base + last_end < vmalloc_start + last_end) { + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = true; + goto retry; + } + goto err_free; + } + + /* + * If next overlaps, move base downwards so that it's + * right below next and then recheck. + */ + if (next && next->va_start < base + end) { + base = pvm_determine_end(&next, &prev, align) - end; + term_area = area; + continue; + } + + /* + * If prev overlaps, shift down next and prev and move + * base so that it's right below new next and then + * recheck. + */ + if (prev && prev->va_end > base + start) { + next = prev; + prev = node_to_va(rb_prev(&next->rb_node)); + base = pvm_determine_end(&next, &prev, align) - end; + term_area = area; + continue; + } + + /* + * This area fits, move on to the previous one. If + * the previous one is the terminal one, we're done. + */ + area = (area + nr_vms - 1) % nr_vms; + if (area == term_area) + break; + start = offsets[area]; + end = start + sizes[area]; + pvm_find_next_prev(base + end, &next, &prev); + } +found: + /* we've found a fitting base, insert all va's */ + for (area = 0; area < nr_vms; area++) { + struct vmap_area *va = vas[area]; + + va->va_start = base + offsets[area]; + va->va_end = va->va_start + sizes[area]; + __insert_vmap_area(va); + } + + vmap_area_pcpu_hole = base + offsets[last_area]; + + spin_unlock(&vmap_area_lock); + + /* insert all vm's */ + for (area = 0; area < nr_vms; area++) + insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, + pcpu_get_vm_areas); + + kfree(vas); + return vms; + +err_free: + for (area = 0; area < nr_vms; area++) { + if (vas) + kfree(vas[area]); + if (vms) + kfree(vms[area]); + } + kfree(vas); + kfree(vms); + return NULL; +} + +/** + * pcpu_free_vm_areas - free vmalloc areas for percpu allocator + * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() + * @nr_vms: the number of allocated areas + * + * Free vm_structs and the array allocated by pcpu_get_vm_areas(). + */ +void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) +{ + int i; + + for (i = 0; i < nr_vms; i++) + free_vm_area(vms[i]); + kfree(vms); +} #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) -- cgit v1.2.3-71-gd317 From c8826dd538602d730ed2c18c6753f1bbfa6c4933 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:52 +0900 Subject: percpu: update embedding first chunk allocator to handle sparse units Now that percpu core can handle very sparse units, given that vmalloc space is large enough, embedding first chunk allocator can use any memory to build the first chunk. This patch teaches pcpu_embed_first_chunk() about distances between cpus and to use alloc/free callbacks to allocate node specific areas for each group and use them for the first chunk. This brings the benefits of embedding allocator to NUMA configurations - no extra TLB pressure with the flexibility of unified dynamic allocator and no need to restructure arch code to build memory layout suitable for percpu. With units put into atom_size aligned groups according to cpu distances, using large page for dynamic chunks is also easily possible with falling back to reuglar pages if large allocation fails. Embedding allocator users are converted to specify NULL cpu_distance_fn, so this patch doesn't cause any visible behavior difference. Following patches will convert them. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 4 +- include/linux/percpu.h | 7 ++- mm/percpu.c | 113 +++++++++++++++++++++++++++++++---------- 3 files changed, 93 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9becc5d4b518..67f6314de9f1 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -234,7 +234,9 @@ static int __init setup_pcpu_embed(bool chosen) return -EINVAL; return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE); + reserve - PERCPU_FIRST_CHUNK_RESERVE, + PAGE_SIZE, NULL, pcpu_fc_alloc, + pcpu_fc_free); } /* diff --git a/include/linux/percpu.h b/include/linux/percpu.h index a7ec840f596c..25359932740e 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -110,8 +110,11 @@ extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK -extern int __init pcpu_embed_first_chunk(size_t reserved_size, - ssize_t dyn_size); +extern int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK diff --git a/mm/percpu.c b/mm/percpu.c index cc9c4c64606d..c2826d05505c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1747,15 +1747,25 @@ early_param("percpu_alloc", percpu_alloc_setup); * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * @alloc_fn: function to allocate percpu page + * @free_fn: funtion to free percpu page * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated - * as a contiguous area using bootmem allocator and used as-is without - * being mapped into vmalloc area. This enables the first chunk to - * piggy back on the linear physical mapping which often uses larger - * page size. + * by calling @alloc_fn and used as-is without being mapped into + * vmalloc area. Allocations are always whole multiples of @atom_size + * aligned to @atom_size. + * + * This enables the first chunk to piggy back on the linear physical + * mapping which often uses larger page size. Please note that this + * can result in very sparse cpu->unit mapping on NUMA machines thus + * requiring large vmalloc address space. Don't use this allocator if + * vmalloc space is not orders of magnitude larger than distances + * between node memory addresses (ie. 32bit NUMA machines). * * When @dyn_size is positive, dynamic area might be larger than * specified to fill page alignment. When @dyn_size is auto, @@ -1763,53 +1773,88 @@ early_param("percpu_alloc", percpu_alloc_setup); * and reserved areas. * * If the needed size is smaller than the minimum or specified unit - * size, the leftover is returned to the bootmem allocator. + * size, the leftover is returned using @free_fn. * * RETURNS: * 0 on success, -errno on failure. */ -int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) +int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn) { + void *base = (void *)ULONG_MAX; + void **areas = NULL; struct pcpu_alloc_info *ai; - size_t size_sum, chunk_size; - void *base; - int unit; - int rc; + size_t size_sum, areas_size; + int group, i, rc; - ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL); + ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, + cpu_distance_fn); if (IS_ERR(ai)) return PTR_ERR(ai); - BUG_ON(ai->nr_groups != 1); - BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; - chunk_size = ai->unit_size * num_possible_cpus(); + areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); - base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, - __pa(MAX_DMA_ADDRESS)); - if (!base) { - pr_warning("PERCPU: failed to allocate %zu bytes for " - "embedding\n", chunk_size); + areas = alloc_bootmem_nopanic(areas_size); + if (!areas) { rc = -ENOMEM; - goto out_free_ai; + goto out_free; } - /* return the leftover and copy */ - for (unit = 0; unit < num_possible_cpus(); unit++) { - void *ptr = base + unit * ai->unit_size; + /* allocate, copy and determine base address */ + for (group = 0; group < ai->nr_groups; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + unsigned int cpu = NR_CPUS; + void *ptr; + + for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) + cpu = gi->cpu_map[i]; + BUG_ON(cpu == NR_CPUS); + + /* allocate space for the whole group */ + ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size); + if (!ptr) { + rc = -ENOMEM; + goto out_free_areas; + } + areas[group] = ptr; - free_bootmem(__pa(ptr + size_sum), ai->unit_size - size_sum); - memcpy(ptr, __per_cpu_load, ai->static_size); + base = min(ptr, base); + + for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { + if (gi->cpu_map[i] == NR_CPUS) { + /* unused unit, free whole */ + free_fn(ptr, ai->unit_size); + continue; + } + /* copy and return the unused part */ + memcpy(ptr, __per_cpu_load, ai->static_size); + free_fn(ptr + size_sum, ai->unit_size - size_sum); + } } - /* we're ready, commit */ + /* base address is now known, determine group base offsets */ + for (group = 0; group < ai->nr_groups; group++) + ai->groups[group].base_offset = areas[group] - base; + pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); rc = pcpu_setup_first_chunk(ai, base); -out_free_ai: + goto out_free; + +out_free_areas: + for (group = 0; group < ai->nr_groups; group++) + free_fn(areas[group], + ai->groups[group].nr_units * ai->unit_size); +out_free: pcpu_free_alloc_info(ai); + if (areas) + free_bootmem(__pa(areas), areas_size); return rc; } #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || @@ -2177,6 +2222,17 @@ void *pcpu_lpage_remapped(void *kaddr) unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); +static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, + size_t align) +{ + return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); +} + +static void __init pcpu_dfl_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + void __init setup_per_cpu_areas(void) { unsigned long delta; @@ -2188,7 +2244,8 @@ void __init setup_per_cpu_areas(void) * what the legacy allocator did. */ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE); + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, + pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); if (rc < 0) panic("Failed to initialized percpu areas."); -- cgit v1.2.3-71-gd317 From e933a73f48e3b2d40cfa56d81e2646f194b5a66a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:53 +0900 Subject: percpu: kill lpage first chunk allocator With x86 converted to embedding allocator, lpage doesn't have any user left. Kill it along with cpa handling code. Signed-off-by: Tejun Heo Cc: Jan Beulich --- Documentation/kernel-parameters.txt | 10 +- arch/x86/mm/pageattr.c | 20 +-- include/linux/percpu.h | 16 --- mm/percpu.c | 241 ------------------------------------ 4 files changed, 6 insertions(+), 281 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index dee9ce2e6cfa..e710093e3d32 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1920,11 +1920,11 @@ and is between 256 and 4096 characters. It is defined in the file See arch/parisc/kernel/pdc_chassis.c percpu_alloc= Select which percpu first chunk allocator to use. - Currently supported values are "embed", "page" and - "lpage". Archs may support subset or none of the - selections. See comments in mm/percpu.c for details - on each allocator. This parameter is primarily for - debugging and performance comparison. + Currently supported values are "embed" and "page". + Archs may support subset or none of the selections. + See comments in mm/percpu.c for details on each + allocator. This parameter is primarily for debugging + and performance comparison. pf. [PARIDE] See Documentation/blockdev/paride.txt. diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index dce282f65700..f53cfc7f963d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -687,7 +687,7 @@ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); - unsigned long vaddr, remapped; + unsigned long vaddr; int ret; if (cpa->pfn >= max_pfn_mapped) @@ -745,24 +745,6 @@ static int cpa_process_alias(struct cpa_data *cpa) } #endif - /* - * If the PMD page was partially used for per-cpu remapping, - * the recycled area needs to be split and modified. Because - * the area is always proper subset of a PMD page - * cpa->numpages is guaranteed to be 1 for these areas, so - * there's no need to loop over and check for further remaps. - */ - remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); - if (remapped) { - WARN_ON(cpa->numpages > 1); - alias_cpa = *cpa; - alias_cpa.vaddr = &remapped; - alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - ret = __change_page_attr_set_clr(&alias_cpa, 0); - if (ret) - return ret; - } - return 0; } diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 25359932740e..878836ca999c 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -82,7 +82,6 @@ enum pcpu_fc { PCPU_FC_AUTO, PCPU_FC_EMBED, PCPU_FC_PAGE, - PCPU_FC_LPAGE, PCPU_FC_NR, }; @@ -95,7 +94,6 @@ typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); -typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, int nr_units); @@ -124,20 +122,6 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn); - -extern void *pcpu_lpage_remapped(void *kaddr); -#else -static inline void *pcpu_lpage_remapped(void *kaddr) -{ - return NULL; -} -#endif - /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index c2826d05505c..77933928107d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1713,7 +1713,6 @@ const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { [PCPU_FC_AUTO] = "auto", [PCPU_FC_EMBED] = "embed", [PCPU_FC_PAGE] = "page", - [PCPU_FC_LPAGE] = "lpage", }; enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; @@ -1729,10 +1728,6 @@ static int __init percpu_alloc_setup(char *str) #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK else if (!strcmp(str, "page")) pcpu_chosen_fc = PCPU_FC_PAGE; -#endif -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK - else if (!strcmp(str, "lpage")) - pcpu_chosen_fc = PCPU_FC_LPAGE; #endif else pr_warning("PERCPU: unknown allocator %s specified\n", str); @@ -1970,242 +1965,6 @@ out_free_ar: } #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -struct pcpul_ent { - void *ptr; - void *map_addr; -}; - -static size_t pcpul_size; -static size_t pcpul_lpage_size; -static int pcpul_nr_lpages; -static struct pcpul_ent *pcpul_map; - -static bool __init pcpul_unit_to_cpu(int unit, const struct pcpu_alloc_info *ai, - unsigned int *cpup) -{ - int group, cunit; - - for (group = 0, cunit = 0; group < ai->nr_groups; group++) { - const struct pcpu_group_info *gi = &ai->groups[group]; - - if (unit < cunit + gi->nr_units) { - if (cpup) - *cpup = gi->cpu_map[unit - cunit]; - return true; - } - cunit += gi->nr_units; - } - - return false; -} - -static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai) -{ - int group, unit, i; - - for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { - const struct pcpu_group_info *gi = &ai->groups[group]; - - for (i = 0; i < gi->nr_units; i++) - if (gi->cpu_map[i] == cpu) - return unit + i; - } - BUG(); -} - -/** - * pcpu_lpage_first_chunk - remap the first percpu chunk using large page - * @ai: pcpu_alloc_info - * @alloc_fn: function to allocate percpu lpage, always called with lpage_size - * @free_fn: function to free percpu memory, @size <= lpage_size - * @map_fn: function to map percpu lpage, always called with lpage_size - * - * This allocator uses large page to build and map the first chunk. - * Unlike other helpers, the caller should provide fully initialized - * @ai. This can be done using pcpu_build_alloc_info(). This two - * stage initialization is to allow arch code to evaluate the - * parameters before committing to it. - * - * Large pages are allocated as directed by @unit_map and other - * parameters and mapped to vmalloc space. Unused holes are returned - * to the page allocator. Note that these holes end up being actively - * mapped twice - once to the physical mapping and to the vmalloc area - * for the first percpu chunk. Depending on architecture, this might - * cause problem when changing page attributes of the returned area. - * These double mapped areas can be detected using - * pcpu_lpage_remapped(). - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn) -{ - static struct vm_struct vm; - const size_t lpage_size = ai->atom_size; - size_t chunk_size, map_size; - unsigned int cpu; - int i, j, unit, nr_units, rc; - - nr_units = 0; - for (i = 0; i < ai->nr_groups; i++) - nr_units += ai->groups[i].nr_units; - - chunk_size = ai->unit_size * nr_units; - BUG_ON(chunk_size % lpage_size); - - pcpul_size = ai->static_size + ai->reserved_size + ai->dyn_size; - pcpul_lpage_size = lpage_size; - pcpul_nr_lpages = chunk_size / lpage_size; - - /* allocate pointer array and alloc large pages */ - map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]); - pcpul_map = alloc_bootmem(map_size); - - /* allocate all pages */ - for (i = 0; i < pcpul_nr_lpages; i++) { - size_t offset = i * lpage_size; - int first_unit = offset / ai->unit_size; - int last_unit = (offset + lpage_size - 1) / ai->unit_size; - void *ptr; - - /* find out which cpu is mapped to this unit */ - for (unit = first_unit; unit <= last_unit; unit++) - if (pcpul_unit_to_cpu(unit, ai, &cpu)) - goto found; - continue; - found: - ptr = alloc_fn(cpu, lpage_size, lpage_size); - if (!ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - pcpul_map[i].ptr = ptr; - } - - /* return unused holes */ - for (unit = 0; unit < nr_units; unit++) { - size_t start = unit * ai->unit_size; - size_t end = start + ai->unit_size; - size_t off, next; - - /* don't free used part of occupied unit */ - if (pcpul_unit_to_cpu(unit, ai, NULL)) - start += pcpul_size; - - /* unit can span more than one page, punch the holes */ - for (off = start; off < end; off = next) { - void *ptr = pcpul_map[off / lpage_size].ptr; - next = min(roundup(off + 1, lpage_size), end); - if (ptr) - free_fn(ptr + off % lpage_size, next - off); - } - } - - /* allocate address, map and copy */ - vm.flags = VM_ALLOC; - vm.size = chunk_size; - vm_area_register_early(&vm, ai->unit_size); - - for (i = 0; i < pcpul_nr_lpages; i++) { - if (!pcpul_map[i].ptr) - continue; - pcpul_map[i].map_addr = vm.addr + i * lpage_size; - map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr); - } - - for_each_possible_cpu(cpu) - memcpy(vm.addr + pcpul_cpu_to_unit(cpu, ai) * ai->unit_size, - __per_cpu_load, ai->static_size); - - /* we're ready, commit */ - pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n", - vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size, - ai->unit_size); - - rc = pcpu_setup_first_chunk(ai, vm.addr); - - /* - * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped - * lpages are pushed to the end and trimmed. - */ - for (i = 0; i < pcpul_nr_lpages - 1; i++) - for (j = i + 1; j < pcpul_nr_lpages; j++) { - struct pcpul_ent tmp; - - if (!pcpul_map[j].ptr) - continue; - if (pcpul_map[i].ptr && - pcpul_map[i].ptr < pcpul_map[j].ptr) - continue; - - tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) - pcpul_nr_lpages--; - - return rc; - -enomem: - for (i = 0; i < pcpul_nr_lpages; i++) - if (pcpul_map[i].ptr) - free_fn(pcpul_map[i].ptr, lpage_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; -} - -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu large - * page mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used large - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. - */ -void *pcpu_lpage_remapped(void *kaddr) -{ - unsigned long lpage_mask = pcpul_lpage_size - 1; - void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask); - unsigned long offset = (unsigned long)kaddr & lpage_mask; - int left = 0, right = pcpul_nr_lpages - 1; - int pos; - - /* pcpul in use at all? */ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < lpage_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > lpage_addr) - right = pos - 1; - else - return pcpul_map[pos].map_addr + offset; - } - - return NULL; -} -#endif /* CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK */ - /* * Generic percpu area setup. * -- cgit v1.2.3-71-gd317 From 31089c13bcb18d2cd2a3ddfbe3a28666346f237e Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 14 Aug 2009 15:47:18 +0200 Subject: timekeeping: Introduce timekeeping_leap_insert Move the adjustment of xtime, wall_to_monotonic and the update of the vsyscall variables to the timekeeping code. Signed-off-by: John Stultz Signed-off-by: Martin Schwidefsky LKML-Reference: <20090814134807.609730216@de.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/time.h | 1 + kernel/time/ntp.c | 7 ++----- kernel/time/timekeeping.c | 7 +++++++ 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time.h b/include/linux/time.h index ea16c1a01d51..e7c844558884 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -147,6 +147,7 @@ extern struct timespec timespec_trunc(struct timespec t, unsigned gran); extern int timekeeping_valid_for_hres(void); extern void update_wall_time(void); extern void update_xtime_cache(u64 nsec); +extern void timekeeping_leap_insert(int leapsecond); struct tms; extern void do_sys_times(struct tms *); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7fc64375ff43..4800f933910e 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) case TIME_OK: break; case TIME_INS: - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; + timekeeping_leap_insert(-1); time_state = TIME_OOP; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); @@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) res = HRTIMER_RESTART; break; case TIME_DEL: - xtime.tv_sec++; + timekeeping_leap_insert(1); time_tai--; - wall_to_monotonic.tv_sec--; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); @@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) time_state = TIME_OK; break; } - update_vsyscall(&xtime, clock); write_sequnlock(&xtime_lock); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 02c0b2c9c674..b8b70fb545fc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -58,6 +58,13 @@ void update_xtime_cache(u64 nsec) struct clocksource *clock; +/* must hold xtime_lock */ +void timekeeping_leap_insert(int leapsecond) +{ + xtime.tv_sec += leapsecond; + wall_to_monotonic.tv_sec -= leapsecond; + update_vsyscall(&xtime, clock); +} #ifdef CONFIG_GENERIC_TIME /** -- cgit v1.2.3-71-gd317 From a0f7d48bfb95a4c5172a2756dbc4b82afc8e9ae4 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:19 +0200 Subject: timekeeping: Remove clocksource inline functions The three inline functions clocksource_read, clocksource_enable and clocksource_disable are simple wrappers of an indirect call plus the copy from and to the mult_orig value. The functions are exclusively used by the timekeeping code which has intimate knowledge of the clocksource anyway. Therefore remove the inline functions. No functional change. Signed-off-by: Martin Schwidefsky Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134807.903108946@de.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 58 --------------------------------------------- kernel/time/timekeeping.c | 41 ++++++++++++++++++++++---------- 2 files changed, 28 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 1219be4fb42e..a1ef46f61c81 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -267,64 +267,6 @@ static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant) return (u32)tmp; } -/** - * clocksource_read: - Access the clocksource's current cycle value - * @cs: pointer to clocksource being read - * - * Uses the clocksource to return the current cycle_t value - */ -static inline cycle_t clocksource_read(struct clocksource *cs) -{ - return cs->read(cs); -} - -/** - * clocksource_enable: - enable clocksource - * @cs: pointer to clocksource - * - * Enables the specified clocksource. The clocksource callback - * function should start up the hardware and setup mult and field - * members of struct clocksource to reflect hardware capabilities. - */ -static inline int clocksource_enable(struct clocksource *cs) -{ - int ret = 0; - - if (cs->enable) - ret = cs->enable(cs); - - /* - * The frequency may have changed while the clocksource - * was disabled. If so the code in ->enable() must update - * the mult value to reflect the new frequency. Make sure - * mult_orig follows this change. - */ - cs->mult_orig = cs->mult; - - return ret; -} - -/** - * clocksource_disable: - disable clocksource - * @cs: pointer to clocksource - * - * Disables the specified clocksource. The clocksource callback - * function should power down the now unused hardware block to - * save power. - */ -static inline void clocksource_disable(struct clocksource *cs) -{ - /* - * Save mult_orig in mult so clocksource_enable() can - * restore the value regardless if ->enable() updates - * the value of mult or not. - */ - cs->mult = cs->mult_orig; - - if (cs->disable) - cs->disable(cs); -} - /** * cyc2ns - converts clocksource cycles to nanoseconds * @cs: Pointer to clocksource diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b8b70fb545fc..016a2591d719 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -79,7 +79,7 @@ static void clocksource_forward_now(void) cycle_t cycle_now, cycle_delta; s64 nsec; - cycle_now = clocksource_read(clock); + cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; @@ -114,7 +114,7 @@ void getnstimeofday(struct timespec *ts) *ts = xtime; /* read clocksource: */ - cycle_now = clocksource_read(clock); + cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; @@ -146,7 +146,7 @@ ktime_t ktime_get(void) nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; /* read clocksource: */ - cycle_now = clocksource_read(clock); + cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; @@ -186,7 +186,7 @@ void ktime_get_ts(struct timespec *ts) tomono = wall_to_monotonic; /* read clocksource: */ - cycle_now = clocksource_read(clock); + cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; @@ -274,16 +274,29 @@ static void change_clocksource(void) clocksource_forward_now(); - if (clocksource_enable(new)) + if (new->enable && !new->enable(new)) return; + /* + * The frequency may have changed while the clocksource + * was disabled. If so the code in ->enable() must update + * the mult value to reflect the new frequency. Make sure + * mult_orig follows this change. + */ + new->mult_orig = new->mult; new->raw_time = clock->raw_time; old = clock; clock = new; - clocksource_disable(old); + /* + * Save mult_orig in mult so that the value can be restored + * regardless if ->enable() updates the value of mult or not. + */ + old->mult = old->mult_orig; + if (old->disable) + old->disable(old); clock->cycle_last = 0; - clock->cycle_last = clocksource_read(clock); + clock->cycle_last = clock->read(clock); clock->error = 0; clock->xtime_nsec = 0; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); @@ -373,7 +386,7 @@ void getrawmonotonic(struct timespec *ts) seq = read_seqbegin(&xtime_lock); /* read clocksource: */ - cycle_now = clocksource_read(clock); + cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; @@ -435,9 +448,12 @@ void __init timekeeping_init(void) ntp_init(); clock = clocksource_get_next(); - clocksource_enable(clock); + if (clock->enable) + clock->enable(clock); + /* set mult_orig on enable */ + clock->mult_orig = clock->mult; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - clock->cycle_last = clocksource_read(clock); + clock->cycle_last = clock->read(clock); xtime.tv_sec = sec; xtime.tv_nsec = 0; @@ -477,8 +493,7 @@ static int timekeeping_resume(struct sys_device *dev) } update_xtime_cache(0); /* re-base the last cycle value */ - clock->cycle_last = 0; - clock->cycle_last = clocksource_read(clock); + clock->cycle_last = clock->read(clock); clock->error = 0; timekeeping_suspended = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -630,7 +645,7 @@ void update_wall_time(void) return; #ifdef CONFIG_GENERIC_TIME - offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; + offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #else offset = clock->cycle_interval; #endif -- cgit v1.2.3-71-gd317 From f1b82746c1e93daf24e1ab9bfbd39bcdb2e7018b Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:21 +0200 Subject: clocksource: Cleanup clocksource selection If a non high-resolution clocksource is first set as override clock and then registered it becomes active even if the system is in one-shot mode. Move the override check from sysfs_override_clocksource to the clocksource selection. That fixes the bug and simplifies the code. The check in clocksource_register for double registration of the same clocksource is removed without replacement. To find the initial clocksource a new weak function in jiffies.c is defined that returns the jiffies clocksource. The architecture code can then override the weak function with a more suitable clocksource, e.g. the TOD clock on s390. [ tglx: Folded in a fix from John Stultz ] Signed-off-by: Martin Schwidefsky Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134808.388024160@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/s390/kernel/time.c | 4 ++ include/linux/clocksource.h | 2 + kernel/time/clocksource.c | 134 +++++++++++++++++--------------------------- kernel/time/jiffies.c | 6 +- kernel/time/timekeeping.c | 4 +- 5 files changed, 64 insertions(+), 86 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index d4c8e9c47c81..afefe514df0f 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -205,6 +205,10 @@ static struct clocksource clocksource_tod = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; +struct clocksource * __init clocksource_default_clock(void) +{ + return &clocksource_tod; +} void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) { diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index a1ef46f61c81..f263b3abf46e 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -322,6 +323,7 @@ extern void clocksource_touch_watchdog(void); extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_resume(void); +extern struct clocksource * __init __weak clocksource_default_clock(void); #ifdef CONFIG_GENERIC_TIME_VSYSCALL extern void update_vsyscall(struct timespec *ts, struct clocksource *c); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7466cb811251..e91662e87cde 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -21,7 +21,6 @@ * * TODO WishList: * o Allow clocksource drivers to be unregistered - * o get rid of clocksource_jiffies extern */ #include @@ -107,12 +106,9 @@ u64 timecounter_cyc2time(struct timecounter *tc, } EXPORT_SYMBOL(timecounter_cyc2time); -/* XXX - Would like a better way for initializing curr_clocksource */ -extern struct clocksource clocksource_jiffies; - /*[Clocksource internal variables]--------- * curr_clocksource: - * currently selected clocksource. Initialized to clocksource_jiffies. + * currently selected clocksource. * next_clocksource: * pending next selected clocksource. * clocksource_list: @@ -123,9 +119,8 @@ extern struct clocksource clocksource_jiffies; * override_name: * Name of the user-specified clocksource. */ -static struct clocksource *curr_clocksource = &clocksource_jiffies; +static struct clocksource *curr_clocksource; static struct clocksource *next_clocksource; -static struct clocksource *clocksource_override; static LIST_HEAD(clocksource_list); static DEFINE_SPINLOCK(clocksource_lock); static char override_name[32]; @@ -320,6 +315,7 @@ void clocksource_touch_watchdog(void) clocksource_resume_watchdog(); } +#ifdef CONFIG_GENERIC_TIME /** * clocksource_get_next - Returns the selected clocksource * @@ -339,56 +335,65 @@ struct clocksource *clocksource_get_next(void) } /** - * select_clocksource - Selects the best registered clocksource. + * clocksource_select - Select the best clocksource available * * Private function. Must hold clocksource_lock when called. * * Select the clocksource with the best rating, or the clocksource, * which is selected by userspace override. */ -static struct clocksource *select_clocksource(void) +static void clocksource_select(void) { - struct clocksource *next; + struct clocksource *best, *cs; if (list_empty(&clocksource_list)) - return NULL; + return; + /* First clocksource on the list has the best rating. */ + best = list_first_entry(&clocksource_list, struct clocksource, list); + /* Check for the override clocksource. */ + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, override_name) != 0) + continue; + /* + * Check to make sure we don't switch to a non-highres + * capable clocksource if the tick code is in oneshot + * mode (highres or nohz) + */ + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && + tick_oneshot_mode_active()) { + /* Override clocksource cannot be used. */ + printk(KERN_WARNING "Override clocksource %s is not " + "HRT compatible. Cannot switch while in " + "HRT/NOHZ mode\n", cs->name); + override_name[0] = 0; + } else + /* Override clocksource can be used. */ + best = cs; + break; + } + if (curr_clocksource != best) + next_clocksource = best; +} - if (clocksource_override) - next = clocksource_override; - else - next = list_entry(clocksource_list.next, struct clocksource, - list); +#else /* CONFIG_GENERIC_TIME */ - if (next == curr_clocksource) - return NULL; +static void clocksource_select(void) { } - return next; -} +#endif /* * Enqueue the clocksource sorted by rating */ -static int clocksource_enqueue(struct clocksource *c) +static void clocksource_enqueue(struct clocksource *cs) { - struct list_head *tmp, *entry = &clocksource_list; - - list_for_each(tmp, &clocksource_list) { - struct clocksource *cs; + struct list_head *entry = &clocksource_list; + struct clocksource *tmp; - cs = list_entry(tmp, struct clocksource, list); - if (cs == c) - return -EBUSY; + list_for_each_entry(tmp, &clocksource_list, list) /* Keep track of the place, where to insert */ - if (cs->rating >= c->rating) - entry = tmp; - } - list_add(&c->list, entry); - - if (strlen(c->name) == strlen(override_name) && - !strcmp(c->name, override_name)) - clocksource_override = c; - - return 0; + if (tmp->rating >= cs->rating) + entry = &tmp->list; + list_add(&cs->list, entry); } /** @@ -397,19 +402,16 @@ static int clocksource_enqueue(struct clocksource *c) * * Returns -EBUSY if registration fails, zero otherwise. */ -int clocksource_register(struct clocksource *c) +int clocksource_register(struct clocksource *cs) { unsigned long flags; - int ret; spin_lock_irqsave(&clocksource_lock, flags); - ret = clocksource_enqueue(c); - if (!ret) - next_clocksource = select_clocksource(); + clocksource_enqueue(cs); + clocksource_select(); spin_unlock_irqrestore(&clocksource_lock, flags); - if (!ret) - clocksource_check_watchdog(c); - return ret; + clocksource_check_watchdog(cs); + return 0; } EXPORT_SYMBOL(clocksource_register); @@ -425,7 +427,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) list_del(&cs->list); cs->rating = rating; clocksource_enqueue(cs); - next_clocksource = select_clocksource(); + clocksource_select(); spin_unlock_irqrestore(&clocksource_lock, flags); } @@ -438,9 +440,7 @@ void clocksource_unregister(struct clocksource *cs) spin_lock_irqsave(&clocksource_lock, flags); list_del(&cs->list); - if (clocksource_override == cs) - clocksource_override = NULL; - next_clocksource = select_clocksource(); + clocksource_select(); spin_unlock_irqrestore(&clocksource_lock, flags); } @@ -478,9 +478,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, struct sysdev_attribute *attr, const char *buf, size_t count) { - struct clocksource *ovr = NULL; size_t ret = count; - int len; /* strings from sysfs write are not 0 terminated! */ if (count >= sizeof(override_name)) @@ -495,37 +493,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, if (count > 0) memcpy(override_name, buf, count); override_name[count] = 0; - - len = strlen(override_name); - if (len) { - struct clocksource *cs; - - ovr = clocksource_override; - /* try to select it: */ - list_for_each_entry(cs, &clocksource_list, list) { - if (strlen(cs->name) == len && - !strcmp(cs->name, override_name)) - ovr = cs; - } - } - - /* - * Check to make sure we don't switch to a non-highres capable - * clocksource if the tick code is in oneshot mode (highres or nohz) - */ - if (tick_oneshot_mode_active() && ovr && - !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { - printk(KERN_WARNING "%s clocksource is not HRT compatible. " - "Cannot switch while in HRT/NOHZ mode\n", ovr->name); - ovr = NULL; - override_name[0] = 0; - } - - /* Reselect, when the override name has changed */ - if (ovr != clocksource_override) { - clocksource_override = ovr; - next_clocksource = select_clocksource(); - } + clocksource_select(); spin_unlock_irq(&clocksource_lock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index c3f6c30816e3..5404a8456909 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = { .read = jiffies_read, .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ - .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT, .shift = JIFFIES_SHIFT, }; @@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void) } core_initcall(init_jiffies_clocksource); + +struct clocksource * __init __weak clocksource_default_clock(void) +{ + return &clocksource_jiffies; +} diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b5673016089f..325a9b63265a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -269,7 +269,7 @@ static void change_clocksource(void) new = clocksource_get_next(); - if (clock == new) + if (!new || clock == new) return; clocksource_forward_now(); @@ -446,7 +446,7 @@ void __init timekeeping_init(void) ntp_init(); - clock = clocksource_get_next(); + clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); /* set mult_orig on enable */ -- cgit v1.2.3-71-gd317 From c55c87c892c1875deace0c8fc28787335277fdf2 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:25 +0200 Subject: clocksource: Move watchdog downgrade to a work queue thread Move the downgrade of an unstable clocksource from the timer interrupt context into the process context of a work queue thread. This is needed to be able to do the clocksource switch with stop_machine. Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134809.354926067@de.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 1 + kernel/time/clocksource.c | 56 +++++++++++++++++++++++++++++++-------------- 2 files changed, 40 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index f263b3abf46e..19ad43af62d0 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -213,6 +213,7 @@ extern struct clocksource *clock; /* current clocksource */ #define CLOCK_SOURCE_WATCHDOG 0x10 #define CLOCK_SOURCE_VALID_FOR_HRES 0x20 +#define CLOCK_SOURCE_UNSTABLE 0x40 /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 56aaa749645d..f1508019bfb4 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -143,10 +143,13 @@ fs_initcall(clocksource_done_booting); static LIST_HEAD(watchdog_list); static struct clocksource *watchdog; static struct timer_list watchdog_timer; +static struct work_struct watchdog_work; static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; static int watchdog_running; +static void clocksource_watchdog_work(struct work_struct *work); + /* * Interval: 0.5sec Threshold: 0.0625s */ @@ -158,15 +161,16 @@ static void clocksource_unstable(struct clocksource *cs, int64_t delta) printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", cs->name, delta); cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); - clocksource_change_rating(cs, 0); - list_del(&cs->wd_list); + cs->flags |= CLOCK_SOURCE_UNSTABLE; + schedule_work(&watchdog_work); } static void clocksource_watchdog(unsigned long data) { - struct clocksource *cs, *tmp; + struct clocksource *cs; cycle_t csnow, wdnow; int64_t wd_nsec, cs_nsec; + int next_cpu; spin_lock(&watchdog_lock); if (!watchdog_running) @@ -176,7 +180,12 @@ static void clocksource_watchdog(unsigned long data) wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); watchdog_last = wdnow; - list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { + list_for_each_entry(cs, &watchdog_list, wd_list) { + + /* Clocksource already marked unstable? */ + if (cs->flags & CLOCK_SOURCE_UNSTABLE) + continue; + csnow = cs->read(cs); /* Clocksource initialized ? */ @@ -207,19 +216,15 @@ static void clocksource_watchdog(unsigned long data) } } - if (!list_empty(&watchdog_list)) { - /* - * Cycle through CPUs to check if the CPUs stay - * synchronized to each other. - */ - int next_cpu = cpumask_next(raw_smp_processor_id(), - cpu_online_mask); - - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(cpu_online_mask); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); - } + /* + * Cycle through CPUs to check if the CPUs stay synchronized + * to each other. + */ + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, next_cpu); out: spin_unlock(&watchdog_lock); } @@ -228,6 +233,7 @@ static inline void clocksource_start_watchdog(void) { if (watchdog_running || !watchdog || list_empty(&watchdog_list)) return; + INIT_WORK(&watchdog_work, clocksource_watchdog_work); init_timer(&watchdog_timer); watchdog_timer.function = clocksource_watchdog; watchdog_last = watchdog->read(watchdog); @@ -313,6 +319,22 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } +static void clocksource_watchdog_work(struct work_struct *work) +{ + struct clocksource *cs, *tmp; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + list_del_init(&cs->wd_list); + clocksource_change_rating(cs, 0); + } + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); + spin_unlock(&watchdog_lock); +} + #else /* CONFIG_CLOCKSOURCE_WATCHDOG */ static void clocksource_enqueue_watchdog(struct clocksource *cs) -- cgit v1.2.3-71-gd317 From 155ec60226ae0ae2aadaa57c951a58a359331030 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:26 +0200 Subject: timekeeping: Introduce struct timekeeper Add struct timekeeper to keep the internal values timekeeping.c needs in regard to the currently selected clock source. This moves the timekeeping intervals, xtime_nsec and the ntp error value from struct clocksource to struct timekeeper. The raw_time is removed from the clocksource as well. It gets treated like xtime as a global variable. Eventually xtime raw_time should be moved to struct timekeeper. [ tglx: minor cleanup ] Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134809.613209842@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/s390/kernel/time.c | 1 - include/linux/clocksource.h | 54 +--------- kernel/time/clocksource.c | 6 +- kernel/time/timekeeping.c | 235 +++++++++++++++++++++++++++++--------------- 4 files changed, 164 insertions(+), 132 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index afefe514df0f..e76c2e7a8b9a 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -280,7 +280,6 @@ void __init time_init(void) now = get_clock(); tod_to_timeval(now - TOD_UNIX_EPOCH, &xtime); clocksource_tod.cycle_last = now; - clocksource_tod.raw_time = xtime; tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, &ts); set_normalized_timespec(&wall_to_monotonic, -ts.tv_sec, -ts.tv_nsec); write_sequnlock_irqrestore(&xtime_lock, flags); diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 19ad43af62d0..e12e3095e2fb 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -155,8 +155,6 @@ extern u64 timecounter_cyc2time(struct timecounter *tc, * @flags: flags describing special properties * @vread: vsyscall based read * @resume: resume function for the clocksource, if necessary - * @cycle_interval: Used internally by timekeeping core, please ignore. - * @xtime_interval: Used internally by timekeeping core, please ignore. */ struct clocksource { /* @@ -182,19 +180,12 @@ struct clocksource { #define CLKSRC_FSYS_MMIO_SET(mmio, addr) do { } while (0) #endif - /* timekeeping specific data, ignore */ - cycle_t cycle_interval; - u64 xtime_interval; - u32 raw_interval; /* * Second part is written at each timer interrupt * Keep it in a different cache line to dirty no * more than one cache line. */ cycle_t cycle_last ____cacheline_aligned_in_smp; - u64 xtime_nsec; - s64 error; - struct timespec raw_time; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG /* Watchdog related data, used by the framework */ @@ -203,8 +194,6 @@ struct clocksource { #endif }; -extern struct clocksource *clock; /* current clocksource */ - /* * Clock source flags bits:: */ @@ -270,50 +259,15 @@ static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant) } /** - * cyc2ns - converts clocksource cycles to nanoseconds - * @cs: Pointer to clocksource - * @cycles: Cycles + * clocksource_cyc2ns - converts clocksource cycles to nanoseconds * - * Uses the clocksource and ntp ajdustment to convert cycle_ts to nanoseconds. + * Converts cycles to nanoseconds, using the given mult and shift. * * XXX - This could use some mult_lxl_ll() asm optimization */ -static inline s64 cyc2ns(struct clocksource *cs, cycle_t cycles) +static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift) { - u64 ret = (u64)cycles; - ret = (ret * cs->mult) >> cs->shift; - return ret; -} - -/** - * clocksource_calculate_interval - Calculates a clocksource interval struct - * - * @c: Pointer to clocksource. - * @length_nsec: Desired interval length in nanoseconds. - * - * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment - * pair and interval request. - * - * Unless you're the timekeeping code, you should not be using this! - */ -static inline void clocksource_calculate_interval(struct clocksource *c, - unsigned long length_nsec) -{ - u64 tmp; - - /* Do the ns -> cycle conversion first, using original mult */ - tmp = length_nsec; - tmp <<= c->shift; - tmp += c->mult_orig/2; - do_div(tmp, c->mult_orig); - - c->cycle_interval = (cycle_t)tmp; - if (c->cycle_interval == 0) - c->cycle_interval = 1; - - /* Go back from cycles -> shifted ns, this time use ntp adjused mult */ - c->xtime_interval = (u64)c->cycle_interval * c->mult; - c->raw_interval = ((u64)c->cycle_interval * c->mult_orig) >> c->shift; + return ((u64) cycles * mult) >> shift; } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f1508019bfb4..f18c9a6bdcf4 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -177,7 +177,8 @@ static void clocksource_watchdog(unsigned long data) goto out; wdnow = watchdog->read(watchdog); - wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); + wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask, + watchdog->mult, watchdog->shift); watchdog_last = wdnow; list_for_each_entry(cs, &watchdog_list, wd_list) { @@ -196,7 +197,8 @@ static void clocksource_watchdog(unsigned long data) } /* Check the deviation from the watchdog clocksource. */ - cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); + cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & + cs->mask, cs->mult, cs->shift); cs->wd_last = csnow; if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { clocksource_unstable(cs, cs_nsec - wd_nsec); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 325a9b63265a..7af45cbf6b13 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -19,6 +19,65 @@ #include #include +/* Structure holding internal timekeeping values. */ +struct timekeeper { + /* Current clocksource used for timekeeping. */ + struct clocksource *clock; + + /* Number of clock cycles in one NTP interval. */ + cycle_t cycle_interval; + /* Number of clock shifted nano seconds in one NTP interval. */ + u64 xtime_interval; + /* Raw nano seconds accumulated per NTP interval. */ + u32 raw_interval; + + /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ + u64 xtime_nsec; + /* Difference between accumulated time and NTP time in ntp + * shifted nano seconds. */ + s64 ntp_error; +}; + +struct timekeeper timekeeper; + +/** + * timekeeper_setup_internals - Set up internals to use clocksource clock. + * + * @clock: Pointer to clocksource. + * + * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment + * pair and interval request. + * + * Unless you're the timekeeping code, you should not be using this! + */ +static void timekeeper_setup_internals(struct clocksource *clock) +{ + cycle_t interval; + u64 tmp; + + timekeeper.clock = clock; + clock->cycle_last = clock->read(clock); + + /* Do the ns -> cycle conversion first, using original mult */ + tmp = NTP_INTERVAL_LENGTH; + tmp <<= clock->shift; + tmp += clock->mult_orig/2; + do_div(tmp, clock->mult_orig); + if (tmp == 0) + tmp = 1; + + interval = (cycle_t) tmp; + timekeeper.cycle_interval = interval; + + /* Go back from cycles -> shifted ns */ + timekeeper.xtime_interval = (u64) interval * clock->mult; + timekeeper.raw_interval = + ((u64) interval * clock->mult_orig) >> clock->shift; + + timekeeper.xtime_nsec = 0; + + timekeeper.ntp_error = 0; +} /* * This read-write spinlock protects us from races in SMP while @@ -46,6 +105,11 @@ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static unsigned long total_sleep_time; /* seconds */ +/* + * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. + */ +struct timespec raw_time; + /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -56,42 +120,42 @@ void update_xtime_cache(u64 nsec) timespec_add_ns(&xtime_cache, nsec); } -struct clocksource *clock; - /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { xtime.tv_sec += leapsecond; wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, clock); + update_vsyscall(&xtime, timekeeper.clock); } #ifdef CONFIG_GENERIC_TIME /** - * clocksource_forward_now - update clock to the current time + * timekeeping_forward_now - update clock to the current time * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ -static void clocksource_forward_now(void) +static void timekeeping_forward_now(void) { cycle_t cycle_now, cycle_delta; + struct clocksource *clock; s64 nsec; + clock = timekeeper.clock; cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; - nsec = cyc2ns(clock, cycle_delta); + nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); /* If arch requires, add in gettimeoffset() */ nsec += arch_gettimeoffset(); timespec_add_ns(&xtime, nsec); - nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; - clock->raw_time.tv_nsec += nsec; + nsec = clocksource_cyc2ns(cycle_delta, clock->mult_orig, clock->shift); + timespec_add_ns(&raw_time, nsec); } /** @@ -103,6 +167,7 @@ static void clocksource_forward_now(void) void getnstimeofday(struct timespec *ts) { cycle_t cycle_now, cycle_delta; + struct clocksource *clock; unsigned long seq; s64 nsecs; @@ -114,13 +179,15 @@ void getnstimeofday(struct timespec *ts) *ts = xtime; /* read clocksource: */ + clock = timekeeper.clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs = cyc2ns(clock, cycle_delta); + nsecs = clocksource_cyc2ns(cycle_delta, clock->mult, + clock->shift); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); @@ -135,6 +202,7 @@ EXPORT_SYMBOL(getnstimeofday); ktime_t ktime_get(void) { cycle_t cycle_now, cycle_delta; + struct clocksource *clock; unsigned int seq; s64 secs, nsecs; @@ -146,13 +214,15 @@ ktime_t ktime_get(void) nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; /* read clocksource: */ + clock = timekeeper.clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs += cyc2ns(clock, cycle_delta); + nsecs += clocksource_cyc2ns(cycle_delta, clock->mult, + clock->shift); } while (read_seqretry(&xtime_lock, seq)); /* @@ -174,6 +244,7 @@ EXPORT_SYMBOL_GPL(ktime_get); void ktime_get_ts(struct timespec *ts) { cycle_t cycle_now, cycle_delta; + struct clocksource *clock; struct timespec tomono; unsigned int seq; s64 nsecs; @@ -186,13 +257,15 @@ void ktime_get_ts(struct timespec *ts) tomono = wall_to_monotonic; /* read clocksource: */ + clock = timekeeper.clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs = cyc2ns(clock, cycle_delta); + nsecs = clocksource_cyc2ns(cycle_delta, clock->mult, + clock->shift); } while (read_seqretry(&xtime_lock, seq)); @@ -233,7 +306,7 @@ int do_settimeofday(struct timespec *tv) write_seqlock_irqsave(&xtime_lock, flags); - clocksource_forward_now(); + timekeeping_forward_now(); ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; @@ -243,10 +316,10 @@ int do_settimeofday(struct timespec *tv) update_xtime_cache(0); - clock->error = 0; + timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, clock); + update_vsyscall(&xtime, timekeeper.clock); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -269,10 +342,10 @@ static void change_clocksource(void) new = clocksource_get_next(); - if (!new || clock == new) + if (!new || timekeeper.clock == new) return; - clocksource_forward_now(); + timekeeping_forward_now(); if (new->enable && !new->enable(new)) return; @@ -284,9 +357,9 @@ static void change_clocksource(void) */ new->mult_orig = new->mult; - new->raw_time = clock->raw_time; - old = clock; - clock = new; + old = timekeeper.clock; + timekeeper_setup_internals(new); + /* * Save mult_orig in mult so that the value can be restored * regardless if ->enable() updates the value of mult or not. @@ -295,22 +368,10 @@ static void change_clocksource(void) if (old->disable) old->disable(old); - clock->cycle_last = clock->read(clock); - clock->error = 0; - clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - tick_clock_notify(); - - /* - * We're holding xtime lock and waking up klogd would deadlock - * us on enqueue. So no printing! - printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); - */ } #else /* GENERIC_TIME */ -static inline void clocksource_forward_now(void) { } +static inline void timekeeping_forward_now(void) { } static inline void change_clocksource(void) { } /** @@ -380,20 +441,23 @@ void getrawmonotonic(struct timespec *ts) unsigned long seq; s64 nsecs; cycle_t cycle_now, cycle_delta; + struct clocksource *clock; do { seq = read_seqbegin(&xtime_lock); /* read clocksource: */ + clock = timekeeper.clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; + nsecs = clocksource_cyc2ns(cycle_delta, clock->mult_orig, + clock->shift); - *ts = clock->raw_time; + *ts = raw_time; } while (read_seqretry(&xtime_lock, seq)); @@ -413,7 +477,7 @@ int timekeeping_valid_for_hres(void) do { seq = read_seqbegin(&xtime_lock); - ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqretry(&xtime_lock, seq)); @@ -439,6 +503,7 @@ unsigned long __attribute__((weak)) read_persistent_clock(void) */ void __init timekeeping_init(void) { + struct clocksource *clock; unsigned long flags; unsigned long sec = read_persistent_clock(); @@ -451,11 +516,13 @@ void __init timekeeping_init(void) clock->enable(clock); /* set mult_orig on enable */ clock->mult_orig = clock->mult; - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - clock->cycle_last = clock->read(clock); + + timekeeper_setup_internals(clock); xtime.tv_sec = sec; xtime.tv_nsec = 0; + raw_time.tv_sec = 0; + raw_time.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); update_xtime_cache(0); @@ -492,8 +559,8 @@ static int timekeeping_resume(struct sys_device *dev) } update_xtime_cache(0); /* re-base the last cycle value */ - clock->cycle_last = clock->read(clock); - clock->error = 0; + timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); + timekeeper.ntp_error = 0; timekeeping_suspended = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -514,7 +581,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) timekeeping_suspend_time = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); - clocksource_forward_now(); + timekeeping_forward_now(); timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -549,7 +616,7 @@ device_initcall(timekeeping_init_device); * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, +static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, s64 *offset) { s64 tick_error, i; @@ -565,7 +632,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * here. This is tuned so that an error of about 1 msec is adjusted * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). */ - error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); error2 = abs(error2); for (look_ahead = 0; error2 > 0; look_ahead++) error2 >>= 2; @@ -574,8 +641,9 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); - tick_error -= clock->xtime_interval >> 1; + tick_error = tick_length >> + (NTP_SCALE_SHIFT - timekeeper.clock->shift + 1); + tick_error -= timekeeper.xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; /* Finally calculate the adjustment shift value. */ @@ -600,18 +668,19 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * this is optimized for the most common adjustments of -1,0,1, * for other values we can do a bit more work. */ -static void clocksource_adjust(s64 offset) +static void timekeeping_adjust(s64 offset) { - s64 error, interval = clock->cycle_interval; + s64 error, interval = timekeeper.cycle_interval; int adj; - error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); + error = timekeeper.ntp_error >> + (NTP_SCALE_SHIFT - timekeeper.clock->shift - 1); if (error > interval) { error >>= 2; if (likely(error <= interval)) adj = 1; else - adj = clocksource_bigadjust(error, &interval, &offset); + adj = timekeeping_bigadjust(error, &interval, &offset); } else if (error < -interval) { error >>= 2; if (likely(error >= -interval)) { @@ -619,15 +688,15 @@ static void clocksource_adjust(s64 offset) interval = -interval; offset = -offset; } else - adj = clocksource_bigadjust(error, &interval, &offset); + adj = timekeeping_bigadjust(error, &interval, &offset); } else return; - clock->mult += adj; - clock->xtime_interval += interval; - clock->xtime_nsec -= offset; - clock->error -= (interval - offset) << - (NTP_SCALE_SHIFT - clock->shift); + timekeeper.clock->mult += adj; + timekeeper.xtime_interval += interval; + timekeeper.xtime_nsec -= offset; + timekeeper.ntp_error -= (interval - offset) << + (NTP_SCALE_SHIFT - timekeeper.clock->shift); } /** @@ -637,53 +706,59 @@ static void clocksource_adjust(s64 offset) */ void update_wall_time(void) { + struct clocksource *clock; cycle_t offset; + s64 nsecs; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return; + clock = timekeeper.clock; #ifdef CONFIG_GENERIC_TIME offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #else - offset = clock->cycle_interval; + offset = timekeeper.cycle_interval; #endif - clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; + timekeeper.xtime_nsec = (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. */ - while (offset >= clock->cycle_interval) { + while (offset >= timekeeper.cycle_interval) { + u64 nsecps = (u64)NSEC_PER_SEC << clock->shift; + /* accumulate one interval */ - offset -= clock->cycle_interval; - clock->cycle_last += clock->cycle_interval; + offset -= timekeeper.cycle_interval; + clock->cycle_last += timekeeper.cycle_interval; - clock->xtime_nsec += clock->xtime_interval; - if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { - clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; + timekeeper.xtime_nsec += timekeeper.xtime_interval; + if (timekeeper.xtime_nsec >= nsecps) { + timekeeper.xtime_nsec -= nsecps; xtime.tv_sec++; second_overflow(); } - clock->raw_time.tv_nsec += clock->raw_interval; - if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { - clock->raw_time.tv_nsec -= NSEC_PER_SEC; - clock->raw_time.tv_sec++; + raw_time.tv_nsec += timekeeper.raw_interval; + if (raw_time.tv_nsec >= NSEC_PER_SEC) { + raw_time.tv_nsec -= NSEC_PER_SEC; + raw_time.tv_sec++; } /* accumulate error between NTP and clock interval */ - clock->error += tick_length; - clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); + timekeeper.ntp_error += tick_length; + timekeeper.ntp_error -= timekeeper.xtime_interval << + (NTP_SCALE_SHIFT - clock->shift); } /* correct the clock when NTP error is too big */ - clocksource_adjust(offset); + timekeeping_adjust(offset); /* * Since in the loop above, we accumulate any amount of time * in xtime_nsec over a second into xtime.tv_sec, its possible for * xtime_nsec to be fairly small after the loop. Further, if we're - * slightly speeding the clocksource up in clocksource_adjust(), + * slightly speeding the clocksource up in timekeeping_adjust(), * its possible the required corrective factor to xtime_nsec could * cause it to underflow. * @@ -695,24 +770,26 @@ void update_wall_time(void) * We'll correct this error next time through this function, when * xtime_nsec is not as small. */ - if (unlikely((s64)clock->xtime_nsec < 0)) { - s64 neg = -(s64)clock->xtime_nsec; - clock->xtime_nsec = 0; - clock->error += neg << (NTP_SCALE_SHIFT - clock->shift); + if (unlikely((s64)timekeeper.xtime_nsec < 0)) { + s64 neg = -(s64)timekeeper.xtime_nsec; + timekeeper.xtime_nsec = 0; + timekeeper.ntp_error += neg << (NTP_SCALE_SHIFT - clock->shift); } /* store full nanoseconds into xtime after rounding it up and * add the remainder to the error difference. */ - xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; - clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; - clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); + xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> clock->shift) + 1; + timekeeper.xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + timekeeper.ntp_error += timekeeper.xtime_nsec << + (NTP_SCALE_SHIFT - clock->shift); - update_xtime_cache(cyc2ns(clock, offset)); + nsecs = clocksource_cyc2ns(offset, clock->mult, clock->shift); + update_xtime_cache(nsecs); /* check to see if there is a new clocksource to use */ change_clocksource(); - update_vsyscall(&xtime, clock); + update_vsyscall(&xtime, timekeeper.clock); } /** -- cgit v1.2.3-71-gd317 From 0a54419836254a27baecd9037103171bcbabaf67 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:28 +0200 Subject: timekeeping: Move NTP adjusted clock multiplier to struct timekeeper The clocksource structure has two multipliers, the unmodified multiplier clock->mult_orig and the NTP corrected multiplier clock->mult. The NTP multiplier is misplaced in the struct clocksource, this is private information of the timekeeping code. Add the mult field to the struct timekeeper to contain the NTP corrected value, keep the unmodifed multiplier in clock->mult and remove clock->mult_orig. Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134810.149047645@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/arm/plat-omap/common.c | 7 ++---- include/linux/clocksource.h | 4 +--- kernel/time/timekeeping.c | 53 ++++++++++++++++++++------------------------- 3 files changed, 27 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/plat-omap/common.c b/arch/arm/plat-omap/common.c index ebcf006406f9..95587b6c0259 100644 --- a/arch/arm/plat-omap/common.c +++ b/arch/arm/plat-omap/common.c @@ -253,11 +253,8 @@ static struct clocksource clocksource_32k = { */ unsigned long long sched_clock(void) { - unsigned long long ret; - - ret = (unsigned long long)clocksource_32k.read(&clocksource_32k); - ret = (ret * clocksource_32k.mult_orig) >> clocksource_32k.shift; - return ret; + return clocksource_cyc2ns(clocksource_32k.read(&clocksource_32k), + clocksource_32k.mult, clocksource_32k.shift); } static int __init omap_init_clocksource_32k(void) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index e12e3095e2fb..e34015effeb6 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -149,8 +149,7 @@ extern u64 timecounter_cyc2time(struct timecounter *tc, * @disable: optional function to disable the clocksource * @mask: bitmask for two's complement * subtraction of non 64 bit counters - * @mult: cycle to nanosecond multiplier (adjusted by NTP) - * @mult_orig: cycle to nanosecond multiplier (unadjusted by NTP) + * @mult: cycle to nanosecond multiplier * @shift: cycle to nanosecond divisor (power of two) * @flags: flags describing special properties * @vread: vsyscall based read @@ -168,7 +167,6 @@ struct clocksource { void (*disable)(struct clocksource *cs); cycle_t mask; u32 mult; - u32 mult_orig; u32 shift; unsigned long flags; cycle_t (*vread)(void); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index dfdab1cefe1e..f4056f6c2632 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -41,6 +41,8 @@ struct timekeeper { /* Shift conversion between clock shifted nano seconds and * ntp shifted nano seconds. */ int ntp_error_shift; + /* NTP adjusted clock multiplier */ + u32 mult; }; struct timekeeper timekeeper; @@ -66,8 +68,8 @@ static void timekeeper_setup_internals(struct clocksource *clock) /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; tmp <<= clock->shift; - tmp += clock->mult_orig/2; - do_div(tmp, clock->mult_orig); + tmp += clock->mult/2; + do_div(tmp, clock->mult); if (tmp == 0) tmp = 1; @@ -77,13 +79,20 @@ static void timekeeper_setup_internals(struct clocksource *clock) /* Go back from cycles -> shifted ns */ timekeeper.xtime_interval = (u64) interval * clock->mult; timekeeper.raw_interval = - ((u64) interval * clock->mult_orig) >> clock->shift; + ((u64) interval * clock->mult) >> clock->shift; timekeeper.xtime_nsec = 0; timekeeper.shift = clock->shift; timekeeper.ntp_error = 0; timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + + /* + * The timekeeper keeps its own mult values for the currently + * active clocksource. These value will be adjusted via NTP + * to counteract clock drifting. + */ + timekeeper.mult = clock->mult; } /* @@ -154,14 +163,15 @@ static void timekeeping_forward_now(void) cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; - nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); /* If arch requires, add in gettimeoffset() */ nsec += arch_gettimeoffset(); timespec_add_ns(&xtime, nsec); - nsec = clocksource_cyc2ns(cycle_delta, clock->mult_orig, clock->shift); + nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); timespec_add_ns(&raw_time, nsec); } @@ -193,8 +203,8 @@ void getnstimeofday(struct timespec *ts) cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs = clocksource_cyc2ns(cycle_delta, clock->mult, - clock->shift); + nsecs = clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); @@ -228,8 +238,8 @@ ktime_t ktime_get(void) cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs += clocksource_cyc2ns(cycle_delta, clock->mult, - clock->shift); + nsecs += clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); } while (read_seqretry(&xtime_lock, seq)); /* @@ -271,8 +281,8 @@ void ktime_get_ts(struct timespec *ts) cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs = clocksource_cyc2ns(cycle_delta, clock->mult, - clock->shift); + nsecs = clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); } while (read_seqretry(&xtime_lock, seq)); @@ -356,22 +366,10 @@ static void change_clocksource(void) if (new->enable && !new->enable(new)) return; - /* - * The frequency may have changed while the clocksource - * was disabled. If so the code in ->enable() must update - * the mult value to reflect the new frequency. Make sure - * mult_orig follows this change. - */ - new->mult_orig = new->mult; old = timekeeper.clock; timekeeper_setup_internals(new); - /* - * Save mult_orig in mult so that the value can be restored - * regardless if ->enable() updates the value of mult or not. - */ - old->mult = old->mult_orig; if (old->disable) old->disable(old); @@ -461,7 +459,7 @@ void getrawmonotonic(struct timespec *ts) cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs = clocksource_cyc2ns(cycle_delta, clock->mult_orig, + nsecs = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); *ts = raw_time; @@ -521,9 +519,6 @@ void __init timekeeping_init(void) clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); - /* set mult_orig on enable */ - clock->mult_orig = clock->mult; - timekeeper_setup_internals(clock); xtime.tv_sec = sec; @@ -697,7 +692,7 @@ static void timekeeping_adjust(s64 offset) } else return; - timekeeper.clock->mult += adj; + timekeeper.mult += adj; timekeeper.xtime_interval += interval; timekeeper.xtime_nsec -= offset; timekeeper.ntp_error -= (interval - offset) << @@ -789,7 +784,7 @@ void update_wall_time(void) timekeeper.ntp_error += timekeeper.xtime_nsec << timekeeper.ntp_error_shift; - nsecs = clocksource_cyc2ns(offset, clock->mult, clock->shift); + nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); update_xtime_cache(nsecs); /* check to see if there is a new clocksource to use */ -- cgit v1.2.3-71-gd317 From 75c5158f70c065b9704b924503d96e8297838f79 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:30 +0200 Subject: timekeeping: Update clocksource with stop_machine update_wall_time calls change_clocksource HZ times per second to check if a new clock source is available. In close to 100% of all calls there is no new clock. Replace the tick based check by an update done with stop_machine. Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134810.711836357@de.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 2 + kernel/time/clocksource.c | 112 +++++++++++++++++--------------------------- kernel/time/timekeeping.c | 41 ++++++++++------ 3 files changed, 72 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index e34015effeb6..9ea40ff26f0e 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -291,4 +291,6 @@ static inline void update_vsyscall_tz(void) } #endif +extern void timekeeping_notify(struct clocksource *clock); + #endif /* _LINUX_CLOCKSOURCE_H */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f18c9a6bdcf4..a1657b5fdeb9 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -109,35 +109,17 @@ EXPORT_SYMBOL(timecounter_cyc2time); /*[Clocksource internal variables]--------- * curr_clocksource: * currently selected clocksource. - * next_clocksource: - * pending next selected clocksource. * clocksource_list: * linked list with the registered clocksources - * clocksource_lock: - * protects manipulations to curr_clocksource and next_clocksource - * and the clocksource_list + * clocksource_mutex: + * protects manipulations to curr_clocksource and the clocksource_list * override_name: * Name of the user-specified clocksource. */ static struct clocksource *curr_clocksource; -static struct clocksource *next_clocksource; static LIST_HEAD(clocksource_list); -static DEFINE_SPINLOCK(clocksource_lock); +static DEFINE_MUTEX(clocksource_mutex); static char override_name[32]; -static int finished_booting; - -/* clocksource_done_booting - Called near the end of core bootup - * - * Hack to avoid lots of clocksource churn at boot time. - * We use fs_initcall because we want this to start before - * device_initcall but after subsys_initcall. - */ -static int __init clocksource_done_booting(void) -{ - finished_booting = 1; - return 0; -} -fs_initcall(clocksource_done_booting); #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static LIST_HEAD(watchdog_list); @@ -356,18 +338,16 @@ static inline void clocksource_resume_watchdog(void) { } void clocksource_resume(void) { struct clocksource *cs; - unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); - list_for_each_entry(cs, &clocksource_list, list) { + list_for_each_entry(cs, &clocksource_list, list) if (cs->resume) cs->resume(); - } clocksource_resume_watchdog(); - spin_unlock_irqrestore(&clocksource_lock, flags); + mutex_unlock(&clocksource_mutex); } /** @@ -383,28 +363,13 @@ void clocksource_touch_watchdog(void) } #ifdef CONFIG_GENERIC_TIME -/** - * clocksource_get_next - Returns the selected clocksource - * - */ -struct clocksource *clocksource_get_next(void) -{ - unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); - if (next_clocksource && finished_booting) { - curr_clocksource = next_clocksource; - next_clocksource = NULL; - } - spin_unlock_irqrestore(&clocksource_lock, flags); - - return curr_clocksource; -} +static int finished_booting; /** * clocksource_select - Select the best clocksource available * - * Private function. Must hold clocksource_lock when called. + * Private function. Must hold clocksource_mutex when called. * * Select the clocksource with the best rating, or the clocksource, * which is selected by userspace override. @@ -413,7 +378,7 @@ static void clocksource_select(void) { struct clocksource *best, *cs; - if (list_empty(&clocksource_list)) + if (!finished_booting || list_empty(&clocksource_list)) return; /* First clocksource on the list has the best rating. */ best = list_first_entry(&clocksource_list, struct clocksource, list); @@ -438,13 +403,31 @@ static void clocksource_select(void) best = cs; break; } - if (curr_clocksource != best) - next_clocksource = best; + if (curr_clocksource != best) { + printk(KERN_INFO "Switching to clocksource %s\n", best->name); + curr_clocksource = best; + timekeeping_notify(curr_clocksource); + } } +/* + * clocksource_done_booting - Called near the end of core bootup + * + * Hack to avoid lots of clocksource churn at boot time. + * We use fs_initcall because we want this to start before + * device_initcall but after subsys_initcall. + */ +static int __init clocksource_done_booting(void) +{ + finished_booting = 1; + clocksource_select(); + return 0; +} +fs_initcall(clocksource_done_booting); + #else /* CONFIG_GENERIC_TIME */ -static void clocksource_select(void) { } +static inline void clocksource_select(void) { } #endif @@ -471,13 +454,11 @@ static void clocksource_enqueue(struct clocksource *cs) */ int clocksource_register(struct clocksource *cs) { - unsigned long flags; - - spin_lock_irqsave(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); clocksource_select(); - spin_unlock_irqrestore(&clocksource_lock, flags); clocksource_enqueue_watchdog(cs); + mutex_unlock(&clocksource_mutex); return 0; } EXPORT_SYMBOL(clocksource_register); @@ -487,14 +468,12 @@ EXPORT_SYMBOL(clocksource_register); */ void clocksource_change_rating(struct clocksource *cs, int rating) { - unsigned long flags; - - spin_lock_irqsave(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); list_del(&cs->list); cs->rating = rating; clocksource_enqueue(cs); clocksource_select(); - spin_unlock_irqrestore(&clocksource_lock, flags); + mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); @@ -503,13 +482,11 @@ EXPORT_SYMBOL(clocksource_change_rating); */ void clocksource_unregister(struct clocksource *cs) { - unsigned long flags; - + mutex_lock(&clocksource_mutex); clocksource_dequeue_watchdog(cs); - spin_lock_irqsave(&clocksource_lock, flags); list_del(&cs->list); clocksource_select(); - spin_unlock_irqrestore(&clocksource_lock, flags); + mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_unregister); @@ -527,9 +504,9 @@ sysfs_show_current_clocksources(struct sys_device *dev, { ssize_t count = 0; - spin_lock_irq(&clocksource_lock); + mutex_lock(&clocksource_mutex); count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); - spin_unlock_irq(&clocksource_lock); + mutex_unlock(&clocksource_mutex); return count; } @@ -557,14 +534,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, if (buf[count-1] == '\n') count--; - spin_lock_irq(&clocksource_lock); + mutex_lock(&clocksource_mutex); if (count > 0) memcpy(override_name, buf, count); override_name[count] = 0; clocksource_select(); - spin_unlock_irq(&clocksource_lock); + mutex_unlock(&clocksource_mutex); return ret; } @@ -584,7 +561,7 @@ sysfs_show_available_clocksources(struct sys_device *dev, struct clocksource *src; ssize_t count = 0; - spin_lock_irq(&clocksource_lock); + mutex_lock(&clocksource_mutex); list_for_each_entry(src, &clocksource_list, list) { /* * Don't show non-HRES clocksource if the tick code is @@ -596,7 +573,7 @@ sysfs_show_available_clocksources(struct sys_device *dev, max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "%s ", src->name); } - spin_unlock_irq(&clocksource_lock); + mutex_unlock(&clocksource_mutex); count += snprintf(buf + count, max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); @@ -651,11 +628,10 @@ device_initcall(init_clocksource_sysfs); */ static int __init boot_override_clocksource(char* str) { - unsigned long flags; - spin_lock_irqsave(&clocksource_lock, flags); + mutex_lock(&clocksource_mutex); if (str) strlcpy(override_name, str, sizeof(override_name)); - spin_unlock_irqrestore(&clocksource_lock, flags); + mutex_unlock(&clocksource_mutex); return 1; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 27ae01b596b7..41579e7fcf9d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -18,6 +18,7 @@ #include #include #include +#include /* Structure holding internal timekeeping values. */ struct timekeeper { @@ -179,6 +180,7 @@ void timekeeping_leap_insert(int leapsecond) } #ifdef CONFIG_GENERIC_TIME + /** * timekeeping_forward_now - update clock to the current time * @@ -351,31 +353,40 @@ EXPORT_SYMBOL(do_settimeofday); * * Accumulates current time interval and initializes new clocksource */ -static void change_clocksource(void) +static int change_clocksource(void *data) { struct clocksource *new, *old; - new = clocksource_get_next(); - - if (!new || timekeeper.clock == new) - return; + new = (struct clocksource *) data; timekeeping_forward_now(); + if (!new->enable || new->enable(new) == 0) { + old = timekeeper.clock; + timekeeper_setup_internals(new); + if (old->disable) + old->disable(old); + } + return 0; +} - if (new->enable && !new->enable(new)) +/** + * timekeeping_notify - Install a new clock source + * @clock: pointer to the clock source + * + * This function is called from clocksource.c after a new, better clock + * source has been registered. The caller holds the clocksource_mutex. + */ +void timekeeping_notify(struct clocksource *clock) +{ + if (timekeeper.clock == clock) return; - - old = timekeeper.clock; - timekeeper_setup_internals(new); - - if (old->disable) - old->disable(old); - + stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); } + #else /* GENERIC_TIME */ + static inline void timekeeping_forward_now(void) { } -static inline void change_clocksource(void) { } /** * ktime_get - get the monotonic time in ktime_t format @@ -416,6 +427,7 @@ void ktime_get_ts(struct timespec *ts) ts->tv_nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts); + #endif /* !GENERIC_TIME */ /** @@ -773,7 +785,6 @@ void update_wall_time(void) update_xtime_cache(nsecs); /* check to see if there is a new clocksource to use */ - change_clocksource(); update_vsyscall(&xtime, timekeeper.clock); } -- cgit v1.2.3-71-gd317 From d4f587c67fc39e0030ddd718675e252e208da4d7 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:31 +0200 Subject: timekeeping: Increase granularity of read_persistent_clock() The persistent clock of some architectures (e.g. s390) have a better granularity than seconds. To reduce the delta between the host clock and the guest clock in a virtualized system change the read_persistent_clock function to return a struct timespec. Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134811.013873340@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/m68knommu/kernel/time.c | 5 ++-- arch/mips/dec/time.c | 5 ++-- arch/mips/lasat/ds1603.c | 5 ++-- arch/mips/lasat/sysctl.c | 8 ++++-- arch/mips/lemote/lm2e/setup.c | 5 ++-- arch/mips/mti-malta/malta-time.c | 5 ++-- arch/mips/pmc-sierra/yosemite/setup.c | 5 ++-- arch/mips/sibyte/swarm/setup.c | 15 +++++++--- arch/mips/sni/time.c | 5 ++-- arch/powerpc/kernel/time.c | 7 +++-- arch/s390/kernel/time.c | 22 +++------------ arch/sh/kernel/time.c | 6 ++-- arch/x86/kernel/rtc.c | 5 ++-- arch/xtensa/kernel/time.c | 5 ++-- include/linux/time.h | 2 +- kernel/time/timekeeping.c | 52 +++++++++++++++++++---------------- 16 files changed, 83 insertions(+), 74 deletions(-) (limited to 'include/linux') diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c index d182b2f72211..68432248515c 100644 --- a/arch/m68knommu/kernel/time.c +++ b/arch/m68knommu/kernel/time.c @@ -72,9 +72,10 @@ static unsigned long read_rtc_mmss(void) return mktime(year, mon, day, hour, min, sec);; } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return read_rtc_mmss(); + ts->tv_sec = read_rtc_mmss(); + ts->tv_nsec = 0; } int update_persistent_clock(struct timespec now) diff --git a/arch/mips/dec/time.c b/arch/mips/dec/time.c index 463136e6685a..02f505f23c32 100644 --- a/arch/mips/dec/time.c +++ b/arch/mips/dec/time.c @@ -18,7 +18,7 @@ #include #include -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned int year, mon, day, hour, min, sec, real_year; unsigned long flags; @@ -53,7 +53,8 @@ unsigned long read_persistent_clock(void) year += real_year - 72 + 2000; - return mktime(year, mon, day, hour, min, sec); + ts->tv_sec = mktime(year, mon, day, hour, min, sec); + ts->tv_nsec = 0; } /* diff --git a/arch/mips/lasat/ds1603.c b/arch/mips/lasat/ds1603.c index 52cb1436a12a..c6fd96ff118d 100644 --- a/arch/mips/lasat/ds1603.c +++ b/arch/mips/lasat/ds1603.c @@ -135,7 +135,7 @@ static void rtc_end_op(void) lasat_ndelay(1000); } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned long word; unsigned long flags; @@ -147,7 +147,8 @@ unsigned long read_persistent_clock(void) rtc_end_op(); spin_unlock_irqrestore(&rtc_lock, flags); - return word; + ts->tv_sec = word; + ts->tv_nsec = 0; } int rtc_mips_set_mmss(unsigned long time) diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c index 8f88886feb12..3f04d4c406b7 100644 --- a/arch/mips/lasat/sysctl.c +++ b/arch/mips/lasat/sysctl.c @@ -92,10 +92,12 @@ static int rtctmp; int proc_dolasatrtc(ctl_table *table, int write, struct file *filp, void *buffer, size_t *lenp, loff_t *ppos) { + struct timespec ts; int r; if (!write) { - rtctmp = read_persistent_clock(); + read_persistent_clock(&ts); + rtctmp = ts.tv_sec; /* check for time < 0 and set to 0 */ if (rtctmp < 0) rtctmp = 0; @@ -134,9 +136,11 @@ int sysctl_lasat_rtc(ctl_table *table, void *oldval, size_t *oldlenp, void *newval, size_t newlen) { + struct timespec ts; int r; - rtctmp = read_persistent_clock(); + read_persistent_clock(&ts); + rtctmp = ts.tv_sec; if (rtctmp < 0) rtctmp = 0; r = sysctl_intvec(table, oldval, oldlenp, newval, newlen); diff --git a/arch/mips/lemote/lm2e/setup.c b/arch/mips/lemote/lm2e/setup.c index ebd6ceaef2fd..24b355df6127 100644 --- a/arch/mips/lemote/lm2e/setup.c +++ b/arch/mips/lemote/lm2e/setup.c @@ -54,9 +54,10 @@ void __init plat_time_init(void) mips_hpt_frequency = cpu_clock_freq / 2; } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return mc146818_get_cmos_time(); + ts->tv_sec = mc146818_get_cmos_time(); + ts->tv_nsec = 0; } void (*__wbflush)(void); diff --git a/arch/mips/mti-malta/malta-time.c b/arch/mips/mti-malta/malta-time.c index 0b97d47691fc..3c6f190aa61c 100644 --- a/arch/mips/mti-malta/malta-time.c +++ b/arch/mips/mti-malta/malta-time.c @@ -100,9 +100,10 @@ static unsigned int __init estimate_cpu_frequency(void) return count; } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return mc146818_get_cmos_time(); + ts->tv_sec = mc146818_get_cmos_time(); + ts->tv_nsec = 0; } static void __init plat_perf_setup(void) diff --git a/arch/mips/pmc-sierra/yosemite/setup.c b/arch/mips/pmc-sierra/yosemite/setup.c index 2d3c0dca275d..3498ac9c35af 100644 --- a/arch/mips/pmc-sierra/yosemite/setup.c +++ b/arch/mips/pmc-sierra/yosemite/setup.c @@ -70,7 +70,7 @@ void __init bus_error_init(void) } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned int year, month, day, hour, min, sec; unsigned long flags; @@ -92,7 +92,8 @@ unsigned long read_persistent_clock(void) m48t37_base->control = 0x00; spin_unlock_irqrestore(&rtc_lock, flags); - return mktime(year, month, day, hour, min, sec); + ts->tv_sec = mktime(year, month, day, hour, min, sec); + ts->tv_nsec = 0; } int rtc_mips_set_time(unsigned long tim) diff --git a/arch/mips/sibyte/swarm/setup.c b/arch/mips/sibyte/swarm/setup.c index 672e45d495a9..623ffc933c4c 100644 --- a/arch/mips/sibyte/swarm/setup.c +++ b/arch/mips/sibyte/swarm/setup.c @@ -87,19 +87,26 @@ enum swarm_rtc_type { enum swarm_rtc_type swarm_rtc_type; -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { + unsigned long sec; + switch (swarm_rtc_type) { case RTC_XICOR: - return xicor_get_time(); + sec = xicor_get_time(); + break; case RTC_M4LT81: - return m41t81_get_time(); + sec = m41t81_get_time(); + break; case RTC_NONE: default: - return mktime(2000, 1, 1, 0, 0, 0); + sec = mktime(2000, 1, 1, 0, 0, 0); + break; } + ts->tv_sec = sec; + tv->tv_nsec = 0; } int rtc_mips_set_time(unsigned long sec) diff --git a/arch/mips/sni/time.c b/arch/mips/sni/time.c index 0d9ec1a5c24a..62df6a598e0a 100644 --- a/arch/mips/sni/time.c +++ b/arch/mips/sni/time.c @@ -182,7 +182,8 @@ void __init plat_time_init(void) setup_pit_timer(); } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return -1; + ts->tv_sec = -1; + ts->tv_nsec = 0; } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index eae4511ceeac..ad63f30fe3da 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -769,7 +769,7 @@ int update_persistent_clock(struct timespec now) return ppc_md.set_rtc_time(&tm); } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { struct rtc_time tm; static int first = 1; @@ -787,8 +787,9 @@ unsigned long read_persistent_clock(void) if (!ppc_md.get_rtc_time) return 0; ppc_md.get_rtc_time(&tm); - return mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, - tm.tm_hour, tm.tm_min, tm.tm_sec); + ts->tv_sec = mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec); + ts->tv_nsec = 0; } /* clocksource code */ diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index e76c2e7a8b9a..a94ec48587b4 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -182,12 +182,9 @@ static void timing_alert_interrupt(__u16 code) static void etr_reset(void); static void stp_reset(void); -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - struct timespec ts; - - tod_to_timeval(get_clock() - TOD_UNIX_EPOCH, &ts); - return ts.tv_sec; + tod_to_timeval(get_clock() - TOD_UNIX_EPOCH, ts); } static cycle_t read_tod_clock(struct clocksource *cs) @@ -248,7 +245,6 @@ void __init time_init(void) { struct timespec ts; unsigned long flags; - cycle_t now; /* Reset time synchronization interfaces. */ etr_reset(); @@ -266,20 +262,10 @@ void __init time_init(void) panic("Could not register TOD clock source"); /* - * The TOD clock is an accurate clock. The xtime should be - * initialized in a way that the difference between TOD and - * xtime is reasonably small. Too bad that timekeeping_init - * sets xtime.tv_nsec to zero. In addition the clock source - * change from the jiffies clock source to the TOD clock - * source add another error of up to 1/HZ second. The same - * function sets wall_to_monotonic to a value that is too - * small for /proc/uptime to be accurate. - * Reset xtime and wall_to_monotonic to sane values. + * Reset wall_to_monotonic to the initial timestamp created + * in head.S to get a precise value in /proc/uptime. */ write_seqlock_irqsave(&xtime_lock, flags); - now = get_clock(); - tod_to_timeval(now - TOD_UNIX_EPOCH, &xtime); - clocksource_tod.cycle_last = now; tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, &ts); set_normalized_timespec(&wall_to_monotonic, -ts.tv_sec, -ts.tv_nsec); write_sequnlock_irqrestore(&xtime_lock, flags); diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index 9b352a1e3fb4..3f4706aa975e 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -39,11 +39,9 @@ void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time; int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time; #ifdef CONFIG_GENERIC_CMOS_UPDATE -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - struct timespec tv; - rtc_sh_get_time(&tv); - return tv.tv_sec; + rtc_sh_get_time(&ts); } int update_persistent_clock(struct timespec now) diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 5d465b207e72..bf67dcb4a44c 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -178,7 +178,7 @@ static int set_rtc_mmss(unsigned long nowtime) } /* not static: needed by APM */ -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned long retval, flags; @@ -186,7 +186,8 @@ unsigned long read_persistent_clock(void) retval = get_wallclock(); spin_unlock_irqrestore(&rtc_lock, flags); - return retval; + ts->tv_sec = retval; + ts->tv_nsec = 0; } int update_persistent_clock(struct timespec now) diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c index 8848120d291b..19085ff0484a 100644 --- a/arch/xtensa/kernel/time.c +++ b/arch/xtensa/kernel/time.c @@ -59,9 +59,8 @@ static struct irqaction timer_irqaction = { void __init time_init(void) { - xtime.tv_nsec = 0; - xtime.tv_sec = read_persistent_clock(); - + /* FIXME: xtime&wall_to_monotonic are set in timekeeping_init. */ + read_persistent_clock(&xtime); set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); diff --git a/include/linux/time.h b/include/linux/time.h index e7c844558884..53a3216f0d1b 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -101,7 +101,7 @@ extern struct timespec xtime; extern struct timespec wall_to_monotonic; extern seqlock_t xtime_lock; -extern unsigned long read_persistent_clock(void); +extern void read_persistent_clock(struct timespec *ts); extern int update_persistent_clock(struct timespec now); extern int no_sync_cmos_clock __read_mostly; void timekeeping_init(void); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 41579e7fcf9d..f1a21ce491e6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -154,7 +154,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); */ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); -static unsigned long total_sleep_time; /* seconds */ +static struct timespec total_sleep_time; /* * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. @@ -487,17 +487,18 @@ int timekeeping_valid_for_hres(void) } /** - * read_persistent_clock - Return time in seconds from the persistent clock. + * read_persistent_clock - Return time from the persistent clock. * * Weak dummy function for arches that do not yet support it. - * Returns seconds from epoch using the battery backed persistent clock. - * Returns zero if unsupported. + * Reads the time from the battery backed persistent clock. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ -unsigned long __attribute__((weak)) read_persistent_clock(void) +void __attribute__((weak)) read_persistent_clock(struct timespec *ts) { - return 0; + ts->tv_sec = 0; + ts->tv_nsec = 0; } /* @@ -507,7 +508,9 @@ void __init timekeeping_init(void) { struct clocksource *clock; unsigned long flags; - unsigned long sec = read_persistent_clock(); + struct timespec now; + + read_persistent_clock(&now); write_seqlock_irqsave(&xtime_lock, flags); @@ -518,19 +521,20 @@ void __init timekeeping_init(void) clock->enable(clock); timekeeper_setup_internals(clock); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; + xtime.tv_sec = now.tv_sec; + xtime.tv_nsec = now.tv_nsec; raw_time.tv_sec = 0; raw_time.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); update_xtime_cache(0); - total_sleep_time = 0; + total_sleep_time.tv_sec = 0; + total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); } /* time in seconds when suspend began */ -static unsigned long timekeeping_suspend_time; +static struct timespec timekeeping_suspend_time; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. @@ -543,18 +547,19 @@ static unsigned long timekeeping_suspend_time; static int timekeeping_resume(struct sys_device *dev) { unsigned long flags; - unsigned long now = read_persistent_clock(); + struct timespec ts; + + read_persistent_clock(&ts); clocksource_resume(); write_seqlock_irqsave(&xtime_lock, flags); - if (now && (now > timekeeping_suspend_time)) { - unsigned long sleep_length = now - timekeeping_suspend_time; - - xtime.tv_sec += sleep_length; - wall_to_monotonic.tv_sec -= sleep_length; - total_sleep_time += sleep_length; + if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { + ts = timespec_sub(ts, timekeeping_suspend_time); + xtime = timespec_add_safe(xtime, ts); + wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); + total_sleep_time = timespec_add_safe(total_sleep_time, ts); } update_xtime_cache(0); /* re-base the last cycle value */ @@ -577,7 +582,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) { unsigned long flags; - timekeeping_suspend_time = read_persistent_clock(); + read_persistent_clock(&timekeeping_suspend_time); write_seqlock_irqsave(&xtime_lock, flags); timekeeping_forward_now(); @@ -801,9 +806,10 @@ void update_wall_time(void) */ void getboottime(struct timespec *ts) { - set_normalized_timespec(ts, - - (wall_to_monotonic.tv_sec + total_sleep_time), - - wall_to_monotonic.tv_nsec); + struct timespec boottime; + + boottime = timespec_add_safe(wall_to_monotonic, total_sleep_time); + set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } /** @@ -812,7 +818,7 @@ void getboottime(struct timespec *ts) */ void monotonic_to_bootbased(struct timespec *ts) { - ts->tv_sec += total_sleep_time; + *ts = timespec_add_safe(*ts, total_sleep_time); } unsigned long get_seconds(void) -- cgit v1.2.3-71-gd317 From 23970e389e9cee43c4b41023935e1417271708b2 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:32 +0200 Subject: timekeeping: Introduce read_boot_clock Add the new function read_boot_clock to get the exact time the system has been started. For architectures without support for exact boot time a new weak function is added that returns 0. Use the exact boot time to initialize wall_to_monotonic, or xtime if the read_boot_clock returned 0. Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134811.296703241@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/s390/kernel/time.c | 17 +++++------------ include/linux/time.h | 1 + kernel/time/timekeeping.c | 24 ++++++++++++++++++++++-- 3 files changed, 28 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index a94ec48587b4..6bff1a1d9060 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -187,6 +187,11 @@ void read_persistent_clock(struct timespec *ts) tod_to_timeval(get_clock() - TOD_UNIX_EPOCH, ts); } +void read_boot_clock(struct timespec *ts) +{ + tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, ts); +} + static cycle_t read_tod_clock(struct clocksource *cs) { return get_clock(); @@ -243,9 +248,6 @@ void update_vsyscall_tz(void) */ void __init time_init(void) { - struct timespec ts; - unsigned long flags; - /* Reset time synchronization interfaces. */ etr_reset(); stp_reset(); @@ -261,15 +263,6 @@ void __init time_init(void) if (clocksource_register(&clocksource_tod) != 0) panic("Could not register TOD clock source"); - /* - * Reset wall_to_monotonic to the initial timestamp created - * in head.S to get a precise value in /proc/uptime. - */ - write_seqlock_irqsave(&xtime_lock, flags); - tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, &ts); - set_normalized_timespec(&wall_to_monotonic, -ts.tv_sec, -ts.tv_nsec); - write_sequnlock_irqrestore(&xtime_lock, flags); - /* Enable TOD clock interrupts on the boot cpu. */ init_cpu_timer(); diff --git a/include/linux/time.h b/include/linux/time.h index 53a3216f0d1b..f505988398e6 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -102,6 +102,7 @@ extern struct timespec wall_to_monotonic; extern seqlock_t xtime_lock; extern void read_persistent_clock(struct timespec *ts); +extern void read_boot_clock(struct timespec *ts); extern int update_persistent_clock(struct timespec now); extern int no_sync_cmos_clock __read_mostly; void timekeeping_init(void); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f1a21ce491e6..15e06defca55 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -501,6 +501,21 @@ void __attribute__((weak)) read_persistent_clock(struct timespec *ts) ts->tv_nsec = 0; } +/** + * read_boot_clock - Return time of the system start. + * + * Weak dummy function for arches that do not yet support it. + * Function to read the exact time the system has been started. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. + * + * XXX - Do be sure to remove it once all arches implement it. + */ +void __attribute__((weak)) read_boot_clock(struct timespec *ts) +{ + ts->tv_sec = 0; + ts->tv_nsec = 0; +} + /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ @@ -508,9 +523,10 @@ void __init timekeeping_init(void) { struct clocksource *clock; unsigned long flags; - struct timespec now; + struct timespec now, boot; read_persistent_clock(&now); + read_boot_clock(&boot); write_seqlock_irqsave(&xtime_lock, flags); @@ -525,8 +541,12 @@ void __init timekeeping_init(void) xtime.tv_nsec = now.tv_nsec; raw_time.tv_sec = 0; raw_time.tv_nsec = 0; + if (boot.tv_sec == 0 && boot.tv_nsec == 0) { + boot.tv_sec = xtime.tv_sec; + boot.tv_nsec = xtime.tv_nsec; + } set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); + -boot.tv_sec, -boot.tv_nsec); update_xtime_cache(0); total_sleep_time.tv_sec = 0; total_sleep_time.tv_nsec = 0; -- cgit v1.2.3-71-gd317 From 0ccff1a49def92d6b838a6da166c89004b3a4d0c Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Mon, 17 Aug 2009 22:38:04 -0400 Subject: jbd2: bitfields should be unsigned This fixes sparse noise: error: dubious one-bit signed bitfield Signed-off-by: H Hartley Sweeten Signed-off-by: Andrew Morton Signed-off-by: "Theodore Ts'o" Cc: Jan Kara --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d97eb652d6ca..52695d3dfd0b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -652,7 +652,7 @@ struct transaction_s * This transaction is being forced and some process is * waiting for it to finish. */ - int t_synchronous_commit:1; + unsigned int t_synchronous_commit:1; /* * For use by the filesystem to store fs-specific data -- cgit v1.2.3-71-gd317 From fc69f4a6af49ee69475dc4217924d9edf77760e0 Mon Sep 17 00:00:00 2001 From: Tai-hwa Liang Date: Sun, 10 May 2009 18:15:39 -0700 Subject: Input: add new driver for Sentelic Finger Sensing Pad This is the driver for Sentelic Finger Sensing Pad which can be found on MSI WIND Netbook. Signed-off-by: Tai-hwa Liang Signed-off-by: Dmitry Torokhov --- Documentation/input/sentelic.txt | 475 ++++++++++++++++++++ drivers/input/mouse/Kconfig | 8 + drivers/input/mouse/Makefile | 1 + drivers/input/mouse/psmouse-base.c | 26 +- drivers/input/mouse/psmouse.h | 1 + drivers/input/mouse/sentelic.c | 867 +++++++++++++++++++++++++++++++++++++ drivers/input/mouse/sentelic.h | 98 +++++ drivers/input/serio/libps2.c | 15 +- include/linux/libps2.h | 1 + 9 files changed, 1488 insertions(+), 4 deletions(-) create mode 100644 Documentation/input/sentelic.txt create mode 100644 drivers/input/mouse/sentelic.c create mode 100644 drivers/input/mouse/sentelic.h (limited to 'include/linux') diff --git a/Documentation/input/sentelic.txt b/Documentation/input/sentelic.txt new file mode 100644 index 000000000000..f7160a2fb6a2 --- /dev/null +++ b/Documentation/input/sentelic.txt @@ -0,0 +1,475 @@ +Copyright (C) 2002-2008 Sentelic Corporation. +Last update: Oct-31-2008 + +============================================================================== +* Finger Sensing Pad Intellimouse Mode(scrolling wheel, 4th and 5th buttons) +============================================================================== +A) MSID 4: Scrolling wheel mode plus Forward page(4th button) and Backward + page (5th button) +@1. Set sample rate to 200; +@2. Set sample rate to 200; +@3. Set sample rate to 80; +@4. Issuing the "Get device ID" command (0xF2) and waits for the response; +@5. FSP will respond 0x04. + +Packet 1 + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |Y|X|y|x|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 | | |B|F|W|W|W|W| + |---------------| |---------------| |---------------| |---------------| + +Byte 1: Bit7 => Y overflow + Bit6 => X overflow + Bit5 => Y sign bit + Bit4 => X sign bit + Bit3 => 1 + Bit2 => Middle Button, 1 is pressed, 0 is not pressed. + Bit1 => Right Button, 1 is pressed, 0 is not pressed. + Bit0 => Left Button, 1 is pressed, 0 is not pressed. +Byte 2: X Movement(9-bit 2's complement integers) +Byte 3: Y Movement(9-bit 2's complement integers) +Byte 4: Bit3~Bit0 => the scrolling wheel's movement since the last data report. + valid values, -8 ~ +7 + Bit4 => 1 = 4th mouse button is pressed, Forward one page. + 0 = 4th mouse button is not pressed. + Bit5 => 1 = 5th mouse button is pressed, Backward one page. + 0 = 5th mouse button is not pressed. + +B) MSID 6: Horizontal and Vertical scrolling. +@ Set bit 1 in register 0x40 to 1 + +# FSP replaces scrolling wheel's movement as 4 bits to show horizontal and + vertical scrolling. + +Packet 1 + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |Y|X|y|x|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 | | |B|F|l|r|u|d| + |---------------| |---------------| |---------------| |---------------| + +Byte 1: Bit7 => Y overflow + Bit6 => X overflow + Bit5 => Y sign bit + Bit4 => X sign bit + Bit3 => 1 + Bit2 => Middle Button, 1 is pressed, 0 is not pressed. + Bit1 => Right Button, 1 is pressed, 0 is not pressed. + Bit0 => Left Button, 1 is pressed, 0 is not pressed. +Byte 2: X Movement(9-bit 2's complement integers) +Byte 3: Y Movement(9-bit 2's complement integers) +Byte 4: Bit0 => the Vertical scrolling movement downward. + Bit1 => the Vertical scrolling movement upward. + Bit2 => the Vertical scrolling movement rightward. + Bit3 => the Vertical scrolling movement leftward. + Bit4 => 1 = 4th mouse button is pressed, Forward one page. + 0 = 4th mouse button is not pressed. + Bit5 => 1 = 5th mouse button is pressed, Backward one page. + 0 = 5th mouse button is not pressed. + +C) MSID 7: +# FSP uses 2 packets(8 Bytes) data to represent Absolute Position + so we have PACKET NUMBER to identify packets. + If PACKET NUMBER is 0, the packet is Packet 1. + If PACKET NUMBER is 1, the packet is Packet 2. + Please count this number in program. + +# MSID6 special packet will be enable at the same time when enable MSID 7. + +============================================================================== +* Absolute position for STL3886-G0. +============================================================================== +@ Set bit 2 or 3 in register 0x40 to 1 +@ Set bit 6 in register 0x40 to 1 + +Packet 1 (ABSOLUTE POSITION) + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |0|1|V|1|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|d|u|X|X|Y|Y| + |---------------| |---------------| |---------------| |---------------| + +Byte 1: Bit7~Bit6 => 00, Normal data packet + => 01, Absolute coordination packet + => 10, Notify packet + Bit5 => valid bit + Bit4 => 1 + Bit3 => 1 + Bit2 => Middle Button, 1 is pressed, 0 is not pressed. + Bit1 => Right Button, 1 is pressed, 0 is not pressed. + Bit0 => Left Button, 1 is pressed, 0 is not pressed. +Byte 2: X coordinate (xpos[9:2]) +Byte 3: Y coordinate (ypos[9:2]) +Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) + Bit3~Bit2 => X coordinate (ypos[1:0]) + Bit4 => scroll up + Bit5 => scroll down + Bit6 => scroll left + Bit7 => scroll right + +Notify Packet for G0 + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |1|0|0|1|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |M|M|M|M|M|M|M|M| 4 |0|0|0|0|0|0|0|0| + |---------------| |---------------| |---------------| |---------------| + +Byte 1: Bit7~Bit6 => 00, Normal data packet + => 01, Absolute coordination packet + => 10, Notify packet + Bit5 => 0 + Bit4 => 1 + Bit3 => 1 + Bit2 => Middle Button, 1 is pressed, 0 is not pressed. + Bit1 => Right Button, 1 is pressed, 0 is not pressed. + Bit0 => Left Button, 1 is pressed, 0 is not pressed. +Byte 2: Message Type => 0x5A (Enable/Disable status packet) + Mode Type => 0xA5 (Normal/Icon mode status) +Byte 3: Message Type => 0x00 (Disabled) + => 0x01 (Enabled) + Mode Type => 0x00 (Normal) + => 0x01 (Icon) +Byte 4: Bit7~Bit0 => Don't Care + +============================================================================== +* Absolute position for STL3888-A0. +============================================================================== +Packet 1 (ABSOLUTE POSITION) + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |0|1|V|A|1|L|0|1| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |x|x|y|y|X|X|Y|Y| + |---------------| |---------------| |---------------| |---------------| + +Byte 1: Bit7~Bit6 => 00, Normal data packet + => 01, Absolute coordination packet + => 10, Notify packet + Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up. + When both fingers are up, the last two reports have zero valid + bit. + Bit4 => arc + Bit3 => 1 + Bit2 => Left Button, 1 is pressed, 0 is released. + Bit1 => 0 + Bit0 => 1 +Byte 2: X coordinate (xpos[9:2]) +Byte 3: Y coordinate (ypos[9:2]) +Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) + Bit3~Bit2 => X coordinate (ypos[1:0]) + Bit5~Bit4 => y1_g + Bit7~Bit6 => x1_g + +Packet 2 (ABSOLUTE POSITION) + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |0|1|V|A|1|R|1|0| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |x|x|y|y|X|X|Y|Y| + |---------------| |---------------| |---------------| |---------------| + +Byte 1: Bit7~Bit6 => 00, Normal data packet + => 01, Absolute coordinates packet + => 10, Notify packet + Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up. + When both fingers are up, the last two reports have zero valid + bit. + Bit4 => arc + Bit3 => 1 + Bit2 => Right Button, 1 is pressed, 0 is released. + Bit1 => 1 + Bit0 => 0 +Byte 2: X coordinate (xpos[9:2]) +Byte 3: Y coordinate (ypos[9:2]) +Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) + Bit3~Bit2 => X coordinate (ypos[1:0]) + Bit5~Bit4 => y2_g + Bit7~Bit6 => x2_g + +Notify Packet for STL3888-A0 + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |1|0|1|P|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |0|0|F|F|0|0|0|i| 4 |r|l|d|u|0|0|0|0| + |---------------| |---------------| |---------------| |---------------| + +Byte 1: Bit7~Bit6 => 00, Normal data packet + => 01, Absolute coordination packet + => 10, Notify packet + Bit5 => 1 + Bit4 => when in absolute coordinates mode (valid when EN_PKT_GO is 1): + 0: left button is generated by the on-pad command + 1: left button is generated by the external button + Bit3 => 1 + Bit2 => Middle Button, 1 is pressed, 0 is not pressed. + Bit1 => Right Button, 1 is pressed, 0 is not pressed. + Bit0 => Left Button, 1 is pressed, 0 is not pressed. +Byte 2: Message Type => 0xB7 (Multi Finger, Multi Coordinate mode) +Byte 3: Bit7~Bit6 => Don't care + Bit5~Bit4 => Number of fingers + Bit3~Bit1 => Reserved + Bit0 => 1: enter gesture mode; 0: leaving gesture mode +Byte 4: Bit7 => scroll right button + Bit6 => scroll left button + Bit5 => scroll down button + Bit4 => scroll up button + * Note that if gesture and additional button (Bit4~Bit7) + happen at the same time, the button information will not + be sent. + Bit3~Bit0 => Reserved + +Sample sequence of Multi-finger, Multi-coordinate mode: + + notify packet (valid bit == 1), abs pkt 1, abs pkt 2, abs pkt 1, + abs pkt 2, ..., notify packet(valid bit == 0) + +============================================================================== +* FSP Enable/Disable packet +============================================================================== + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |Y|X|0|0|1|M|R|L| 2 |0|1|0|1|1|0|1|E| 3 | | | | | | | | | 4 | | | | | | | | | + |---------------| |---------------| |---------------| |---------------| + +FSP will send out enable/disable packet when FSP receive PS/2 enable/disable +command. Host will receive the packet which Middle, Right, Left button will +be set. The packet only use byte 0 and byte 1 as a pattern of original packet. +Ignore the other bytes of the packet. + +Byte 1: Bit7 => 0, Y overflow + Bit6 => 0, X overflow + Bit5 => 0, Y sign bit + Bit4 => 0, X sign bit + Bit3 => 1 + Bit2 => 1, Middle Button + Bit1 => 1, Right Button + Bit0 => 1, Left Button +Byte 2: Bit7~1 => (0101101b) + Bit0 => 1 = Enable + 0 = Disable +Byte 3: Don't care +Byte 4: Don't care (MOUSE ID 3, 4) +Byte 5~8: Don't care (Absolute packet) + +============================================================================== +* PS/2 Command Set +============================================================================== + +FSP supports basic PS/2 commanding set and modes, refer to following URL for +details about PS/2 commands: + +http://www.computer-engineering.org/index.php?title=PS/2_Mouse_Interface + +============================================================================== +* Programming Sequence for Determining Packet Parsing Flow +============================================================================== +1. Identify FSP by reading device ID(0x00) and version(0x01) register + +2. Determine number of buttons by reading status2 (0x0b) register + + buttons = reg[0x0b] & 0x30 + + if buttons == 0x30 or buttons == 0x20: + # two/four buttons + Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse' + section A for packet parsing detail(ignore byte 4, bit ~ 7) + elif buttons == 0x10: + # 6 buttons + Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse' + section B for packet parsing detail + elif buttons == 0x00: + # 6 buttons + Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse' + section A for packet parsing detail + +============================================================================== +* Programming Sequence for Register Reading/Writing +============================================================================== + +Register inversion requirement: + + Following values needed to be inverted(the '~' operator in C) before being +sent to FSP: + + 0xe9, 0xee, 0xf2 and 0xff. + +Register swapping requirement: + + Following values needed to have their higher 4 bits and lower 4 bits being +swapped before being sent to FSP: + + 10, 20, 40, 60, 80, 100 and 200. + +Register reading sequence: + + 1. send 0xf3 PS/2 command to FSP; + + 2. send 0x66 PS/2 command to FSP; + + 3. send 0x88 PS/2 command to FSP; + + 4. send 0xf3 PS/2 command to FSP; + + 5. if the register address being to read is not required to be + inverted(refer to the 'Register inversion requirement' section), + goto step 6 + + 5a. send 0x68 PS/2 command to FSP; + + 5b. send the inverted register address to FSP and goto step 8; + + 6. if the register address being to read is not required to be + swapped(refer to the 'Register swapping requirement' section), + goto step 7 + + 6a. send 0xcc PS/2 command to FSP; + + 6b. send the swapped register address to FSP and goto step 8; + + 7. send 0x66 PS/2 command to FSP; + + 7a. send the original register address to FSP and goto step 8; + + 8. send 0xe9(status request) PS/2 command to FSP; + + 9. the response read from FSP should be the requested register value. + +Register writing sequence: + + 1. send 0xf3 PS/2 command to FSP; + + 2. if the register address being to write is not required to be + inverted(refer to the 'Register inversion requirement' section), + goto step 3 + + 2a. send 0x74 PS/2 command to FSP; + + 2b. send the inverted register address to FSP and goto step 5; + + 3. if the register address being to write is not required to be + swapped(refer to the 'Register swapping requirement' section), + goto step 4 + + 3a. send 0x77 PS/2 command to FSP; + + 3b. send the swapped register address to FSP and goto step 5; + + 4. send 0x55 PS/2 command to FSP; + + 4a. send the register address to FSP and goto step 5; + + 5. send 0xf3 PS/2 command to FSP; + + 6. if the register value being to write is not required to be + inverted(refer to the 'Register inversion requirement' section), + goto step 7 + + 6a. send 0x47 PS/2 command to FSP; + + 6b. send the inverted register value to FSP and goto step 9; + + 7. if the register value being to write is not required to be + swapped(refer to the 'Register swapping requirement' section), + goto step 8 + + 7a. send 0x44 PS/2 command to FSP; + + 7b. send the swapped register value to FSP and goto step 9; + + 8. send 0x33 PS/2 command to FSP; + + 8a. send the register value to FSP; + + 9. the register writing sequence is completed. + +============================================================================== +* Register Listing +============================================================================== + +offset width default r/w name +0x00 bit7~bit0 0x01 RO device ID + +0x01 bit7~bit0 0xc0 RW version ID + +0x02 bit7~bit0 0x01 RO vendor ID + +0x03 bit7~bit0 0x01 RO product ID + +0x04 bit3~bit0 0x01 RW revision ID + +0x0b RO test mode status 1 + bit3 1 RO 0: rotate 180 degree, 1: no rotation + + bit5~bit4 RO number of buttons + 11 => 2, lbtn/rbtn + 10 => 4, lbtn/rbtn/scru/scrd + 01 => 6, lbtn/rbtn/scru/scrd/scrl/scrr + 00 => 6, lbtn/rbtn/scru/scrd/fbtn/bbtn + +0x0f RW register file page control + bit0 0 RW 1 to enable page 1 register files + +0x10 RW system control 1 + bit0 1 RW Reserved, must be 1 + bit1 0 RW Reserved, must be 0 + bit4 1 RW Reserved, must be 0 + bit5 0 RW register clock gating enable + 0: read only, 1: read/write enable + (Note that following registers does not require clock gating being + enabled prior to write: 05 06 07 08 09 0c 0f 10 11 12 16 17 18 23 2e + 40 41 42 43.) + +0x31 RW on-pad command detection + bit7 0 RW on-pad command left button down tag + enable + 0: disable, 1: enable + +0x34 RW on-pad command control 5 + bit4~bit0 0x05 RW XLO in 0s/4/1, so 03h = 0010.1b = 2.5 + (Note that position unit is in 0.5 scanline) + + bit7 0 RW on-pad tap zone enable + 0: disable, 1: enable + +0x35 RW on-pad command control 6 + bit4~bit0 0x1d RW XHI in 0s/4/1, so 19h = 1100.1b = 12.5 + (Note that position unit is in 0.5 scanline) + +0x36 RW on-pad command control 7 + bit4~bit0 0x04 RW YLO in 0s/4/1, so 03h = 0010.1b = 2.5 + (Note that position unit is in 0.5 scanline) + +0x37 RW on-pad command control 8 + bit4~bit0 0x13 RW YHI in 0s/4/1, so 11h = 1000.1b = 8.5 + (Note that position unit is in 0.5 scanline) + +0x40 RW system control 5 + bit1 0 RW FSP Intellimouse mode enable + 0: disable, 1: enable + + bit2 0 RW movement + abs. coordinate mode enable + 0: disable, 1: enable + (Note that this function has the functionality of bit 1 even when + bit 1 is not set. However, the format is different from that of bit 1. + In addition, when bit 1 and bit 2 are set at the same time, bit 2 will + override bit 1.) + + bit3 0 RW abs. coordinate only mode enable + 0: disable, 1: enable + (Note that this function has the functionality of bit 1 even when + bit 1 is not set. However, the format is different from that of bit 1. + In addition, when bit 1, bit 2 and bit 3 are set at the same time, + bit 3 will override bit 1 and 2.) + + bit5 0 RW auto switch enable + 0: disable, 1: enable + + bit6 0 RW G0 abs. + notify packet format enable + 0: disable, 1: enable + (Note that the absolute/relative coordinate output still depends on + bit 2 and 3. That is, if any of those bit is 1, host will receive + absolute coordinates; otherwise, host only receives packets with + relative coordinate.) + +0x43 RW on-pad control + bit0 0 RW on-pad control enable + 0: disable, 1: enable + (Note that if this bit is cleared, bit 3/5 will be ineffective) + + bit3 0 RW on-pad fix vertical scrolling enable + 0: disable, 1: enable + + bit5 0 RW on-pad fix horizontal scrolling enable + 0: disable, 1: enable diff --git a/drivers/input/mouse/Kconfig b/drivers/input/mouse/Kconfig index 90bef5d498f0..3feeb3af8abd 100644 --- a/drivers/input/mouse/Kconfig +++ b/drivers/input/mouse/Kconfig @@ -107,6 +107,14 @@ config MOUSE_PS2_ELANTECH entries. For further information, see . +config MOUSE_PS2_SENTELIC + bool "Sentelic Finger Sensing Pad PS/2 protocol extension" + depends on MOUSE_PS2 + help + Say Y here if you have a laptop (such as MSI WIND Netbook) + with Sentelic Finger Sensing Pad touchpad. + + If unsure, say N. config MOUSE_PS2_TOUCHKIT bool "eGalax TouchKit PS/2 protocol extension" diff --git a/drivers/input/mouse/Makefile b/drivers/input/mouse/Makefile index ea58c9a372b6..570c84a4a654 100644 --- a/drivers/input/mouse/Makefile +++ b/drivers/input/mouse/Makefile @@ -27,5 +27,6 @@ psmouse-$(CONFIG_MOUSE_PS2_ELANTECH) += elantech.o psmouse-$(CONFIG_MOUSE_PS2_OLPC) += hgpk.o psmouse-$(CONFIG_MOUSE_PS2_LOGIPS2PP) += logips2pp.o psmouse-$(CONFIG_MOUSE_PS2_LIFEBOOK) += lifebook.o +psmouse-$(CONFIG_MOUSE_PS2_SENTELIC) += sentelic.o psmouse-$(CONFIG_MOUSE_PS2_TRACKPOINT) += trackpoint.o psmouse-$(CONFIG_MOUSE_PS2_TOUCHKIT) += touchkit_ps2.o diff --git a/drivers/input/mouse/psmouse-base.c b/drivers/input/mouse/psmouse-base.c index b407b355dceb..df318887ca09 100644 --- a/drivers/input/mouse/psmouse-base.c +++ b/drivers/input/mouse/psmouse-base.c @@ -30,6 +30,7 @@ #include "trackpoint.h" #include "touchkit_ps2.h" #include "elantech.h" +#include "sentelic.h" #define DRIVER_DESC "PS/2 mouse driver" @@ -666,6 +667,20 @@ static int psmouse_extensions(struct psmouse *psmouse, max_proto = PSMOUSE_IMEX; } +/* + * Try Finger Sensing Pad + */ + if (max_proto > PSMOUSE_IMEX) { + if (fsp_detect(psmouse, set_properties) == 0) { + if (!set_properties || fsp_init(psmouse) == 0) + return PSMOUSE_FSP; +/* + * Init failed, try basic relative protocols + */ + max_proto = PSMOUSE_IMEX; + } + } + if (max_proto > PSMOUSE_IMEX) { if (genius_detect(psmouse, set_properties) == 0) return PSMOUSE_GENPS; @@ -813,7 +828,16 @@ static const struct psmouse_protocol psmouse_protocols[] = { .detect = elantech_detect, .init = elantech_init, }, - #endif +#endif +#ifdef CONFIG_MOUSE_PS2_SENTELIC + { + .type = PSMOUSE_FSP, + .name = "FSPPS/2", + .alias = "fsp", + .detect = fsp_detect, + .init = fsp_init, + }, +#endif { .type = PSMOUSE_CORTRON, .name = "CortronPS/2", diff --git a/drivers/input/mouse/psmouse.h b/drivers/input/mouse/psmouse.h index e3562daeb2ed..cca1744c2a08 100644 --- a/drivers/input/mouse/psmouse.h +++ b/drivers/input/mouse/psmouse.h @@ -91,6 +91,7 @@ enum psmouse_type { PSMOUSE_CORTRON, PSMOUSE_HGPK, PSMOUSE_ELANTECH, + PSMOUSE_FSP, PSMOUSE_AUTO /* This one should always be last */ }; diff --git a/drivers/input/mouse/sentelic.c b/drivers/input/mouse/sentelic.c new file mode 100644 index 000000000000..97b1e72855a0 --- /dev/null +++ b/drivers/input/mouse/sentelic.c @@ -0,0 +1,867 @@ +/*- + * Finger Sensing Pad PS/2 mouse driver. + * + * Copyright (C) 2005-2007 Asia Vital Components Co., Ltd. + * Copyright (C) 2005-2009 Tai-hwa Liang, Sentelic Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "psmouse.h" +#include "sentelic.h" + +/* + * Timeout for FSP PS/2 command only (in milliseconds). + */ +#define FSP_CMD_TIMEOUT 200 +#define FSP_CMD_TIMEOUT2 30 + +/** Driver version. */ +static const char fsp_drv_ver[] = "1.0.0-K"; + +/* + * Make sure that the value being sent to FSP will not conflict with + * possible sample rate values. + */ +static unsigned char fsp_test_swap_cmd(unsigned char reg_val) +{ + switch (reg_val) { + case 10: case 20: case 40: case 60: case 80: case 100: case 200: + /* + * The requested value being sent to FSP matched to possible + * sample rates, swap the given value such that the hardware + * wouldn't get confused. + */ + return (reg_val >> 4) | (reg_val << 4); + default: + return reg_val; /* swap isn't necessary */ + } +} + +/* + * Make sure that the value being sent to FSP will not conflict with certain + * commands. + */ +static unsigned char fsp_test_invert_cmd(unsigned char reg_val) +{ + switch (reg_val) { + case 0xe9: case 0xee: case 0xf2: case 0xff: + /* + * The requested value being sent to FSP matched to certain + * commands, inverse the given value such that the hardware + * wouldn't get confused. + */ + return ~reg_val; + default: + return reg_val; /* inversion isn't necessary */ + } +} + +static int fsp_reg_read(struct psmouse *psmouse, int reg_addr, int *reg_val) +{ + struct ps2dev *ps2dev = &psmouse->ps2dev; + unsigned char param[3]; + unsigned char addr; + int rc = -1; + + /* + * We need to shut off the device and switch it into command + * mode so we don't confuse our protocol handler. We don't need + * to do that for writes because sysfs set helper does this for + * us. + */ + ps2_command(ps2dev, NULL, PSMOUSE_CMD_DISABLE); + psmouse_set_state(psmouse, PSMOUSE_CMD_MODE); + mutex_lock(&ps2dev->cmd_mutex); + + if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0) + goto out; + + /* should return 0xfe(request for resending) */ + ps2_sendbyte(ps2dev, 0x66, FSP_CMD_TIMEOUT2); + /* should return 0xfc(failed) */ + ps2_sendbyte(ps2dev, 0x88, FSP_CMD_TIMEOUT2); + + if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0) + goto out; + + if ((addr = fsp_test_invert_cmd(reg_addr)) != reg_addr) { + ps2_sendbyte(ps2dev, 0x68, FSP_CMD_TIMEOUT2); + } else if ((addr = fsp_test_swap_cmd(reg_addr)) != reg_addr) { + /* swapping is required */ + ps2_sendbyte(ps2dev, 0xcc, FSP_CMD_TIMEOUT2); + /* expect 0xfe */ + } else { + /* swapping isn't necessary */ + ps2_sendbyte(ps2dev, 0x66, FSP_CMD_TIMEOUT2); + /* expect 0xfe */ + } + /* should return 0xfc(failed) */ + ps2_sendbyte(ps2dev, addr, FSP_CMD_TIMEOUT); + + if (__ps2_command(ps2dev, param, PSMOUSE_CMD_GETINFO) < 0) + goto out; + + *reg_val = param[2]; + rc = 0; + + out: + mutex_unlock(&ps2dev->cmd_mutex); + ps2_command(ps2dev, NULL, PSMOUSE_CMD_ENABLE); + psmouse_set_state(psmouse, PSMOUSE_ACTIVATED); + dev_dbg(&ps2dev->serio->dev, "READ REG: 0x%02x is 0x%02x (rc = %d)\n", + reg_addr, *reg_val, rc); + return rc; +} + +static int fsp_reg_write(struct psmouse *psmouse, int reg_addr, int reg_val) +{ + struct ps2dev *ps2dev = &psmouse->ps2dev; + unsigned char v; + int rc = -1; + + mutex_lock(&ps2dev->cmd_mutex); + + if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0) + goto out; + + if ((v = fsp_test_invert_cmd(reg_addr)) != reg_addr) { + /* inversion is required */ + ps2_sendbyte(ps2dev, 0x74, FSP_CMD_TIMEOUT2); + } else { + if ((v = fsp_test_swap_cmd(reg_addr)) != reg_addr) { + /* swapping is required */ + ps2_sendbyte(ps2dev, 0x77, FSP_CMD_TIMEOUT2); + } else { + /* swapping isn't necessary */ + ps2_sendbyte(ps2dev, 0x55, FSP_CMD_TIMEOUT2); + } + } + /* write the register address in correct order */ + ps2_sendbyte(ps2dev, v, FSP_CMD_TIMEOUT2); + + if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0) + return -1; + + if ((v = fsp_test_invert_cmd(reg_val)) != reg_val) { + /* inversion is required */ + ps2_sendbyte(ps2dev, 0x47, FSP_CMD_TIMEOUT2); + } else if ((v = fsp_test_swap_cmd(reg_val)) != reg_val) { + /* swapping is required */ + ps2_sendbyte(ps2dev, 0x44, FSP_CMD_TIMEOUT2); + } else { + /* swapping isn't necessary */ + ps2_sendbyte(ps2dev, 0x33, FSP_CMD_TIMEOUT2); + } + + /* write the register value in correct order */ + ps2_sendbyte(ps2dev, v, FSP_CMD_TIMEOUT2); + rc = 0; + + out: + mutex_unlock(&ps2dev->cmd_mutex); + dev_dbg(&ps2dev->serio->dev, "WRITE REG: 0x%02x to 0x%02x (rc = %d)\n", + reg_addr, reg_val, rc); + return rc; +} + +/* Enable register clock gating for writing certain registers */ +static int fsp_reg_write_enable(struct psmouse *psmouse, bool enable) +{ + int v, nv; + + if (fsp_reg_read(psmouse, FSP_REG_SYSCTL1, &v) == -1) + return -1; + + if (enable) + nv = v | FSP_BIT_EN_REG_CLK; + else + nv = v & ~FSP_BIT_EN_REG_CLK; + + /* only write if necessary */ + if (nv != v) + if (fsp_reg_write(psmouse, FSP_REG_SYSCTL1, nv) == -1) + return -1; + + return 0; +} + +static int fsp_page_reg_read(struct psmouse *psmouse, int *reg_val) +{ + struct ps2dev *ps2dev = &psmouse->ps2dev; + unsigned char param[3]; + int rc = -1; + + ps2_command(ps2dev, NULL, PSMOUSE_CMD_DISABLE); + psmouse_set_state(psmouse, PSMOUSE_CMD_MODE); + mutex_lock(&ps2dev->cmd_mutex); + + if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0) + goto out; + + ps2_sendbyte(ps2dev, 0x66, FSP_CMD_TIMEOUT2); + ps2_sendbyte(ps2dev, 0x88, FSP_CMD_TIMEOUT2); + + if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0) + goto out; + + ps2_sendbyte(ps2dev, 0x83, FSP_CMD_TIMEOUT2); + ps2_sendbyte(ps2dev, 0x88, FSP_CMD_TIMEOUT2); + + /* get the returned result */ + if (__ps2_command(ps2dev, param, PSMOUSE_CMD_GETINFO)) + goto out; + + *reg_val = param[2]; + rc = 0; + + out: + mutex_unlock(&ps2dev->cmd_mutex); + ps2_command(ps2dev, NULL, PSMOUSE_CMD_ENABLE); + psmouse_set_state(psmouse, PSMOUSE_ACTIVATED); + dev_dbg(&ps2dev->serio->dev, "READ PAGE REG: 0x%02x (rc = %d)\n", + *reg_val, rc); + return rc; +} + +static int fsp_page_reg_write(struct psmouse *psmouse, int reg_val) +{ + struct ps2dev *ps2dev = &psmouse->ps2dev; + unsigned char v; + int rc = -1; + + mutex_lock(&ps2dev->cmd_mutex); + + if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0) + goto out; + + ps2_sendbyte(ps2dev, 0x38, FSP_CMD_TIMEOUT2); + ps2_sendbyte(ps2dev, 0x88, FSP_CMD_TIMEOUT2); + + if (ps2_sendbyte(ps2dev, 0xf3, FSP_CMD_TIMEOUT) < 0) + return -1; + + if ((v = fsp_test_invert_cmd(reg_val)) != reg_val) { + ps2_sendbyte(ps2dev, 0x47, FSP_CMD_TIMEOUT2); + } else if ((v = fsp_test_swap_cmd(reg_val)) != reg_val) { + /* swapping is required */ + ps2_sendbyte(ps2dev, 0x44, FSP_CMD_TIMEOUT2); + } else { + /* swapping isn't necessary */ + ps2_sendbyte(ps2dev, 0x33, FSP_CMD_TIMEOUT2); + } + + ps2_sendbyte(ps2dev, v, FSP_CMD_TIMEOUT2); + rc = 0; + + out: + mutex_unlock(&ps2dev->cmd_mutex); + dev_dbg(&ps2dev->serio->dev, "WRITE PAGE REG: to 0x%02x (rc = %d)\n", + reg_val, rc); + return rc; +} + +static int fsp_get_version(struct psmouse *psmouse, int *version) +{ + if (fsp_reg_read(psmouse, FSP_REG_VERSION, version)) + return -EIO; + + return 0; +} + +static int fsp_get_revision(struct psmouse *psmouse, int *rev) +{ + if (fsp_reg_read(psmouse, FSP_REG_REVISION, rev)) + return -EIO; + + return 0; +} + +static int fsp_get_buttons(struct psmouse *psmouse, int *btn) +{ + static const int buttons[] = { + 0x16, /* Left/Middle/Right/Forward/Backward & Scroll Up/Down */ + 0x06, /* Left/Middle/Right & Scroll Up/Down/Right/Left */ + 0x04, /* Left/Middle/Right & Scroll Up/Down */ + 0x02, /* Left/Middle/Right */ + }; + int val; + + if (fsp_reg_read(psmouse, FSP_REG_TMOD_STATUS1, &val) == -1) + return -EIO; + + *btn = buttons[(val & 0x30) >> 4]; + return 0; +} + +/* Enable on-pad command tag output */ +static int fsp_opc_tag_enable(struct psmouse *psmouse, bool enable) +{ + int v, nv; + int res = 0; + + if (fsp_reg_read(psmouse, FSP_REG_OPC_QDOWN, &v) == -1) { + dev_err(&psmouse->ps2dev.serio->dev, "Unable get OPC state.\n"); + return -EIO; + } + + if (enable) + nv = v | FSP_BIT_EN_OPC_TAG; + else + nv = v & ~FSP_BIT_EN_OPC_TAG; + + /* only write if necessary */ + if (nv != v) { + fsp_reg_write_enable(psmouse, true); + res = fsp_reg_write(psmouse, FSP_REG_OPC_QDOWN, nv); + fsp_reg_write_enable(psmouse, false); + } + + if (res != 0) { + dev_err(&psmouse->ps2dev.serio->dev, + "Unable to enable OPC tag.\n"); + res = -EIO; + } + + return res; +} + +static int fsp_onpad_vscr(struct psmouse *psmouse, bool enable) +{ + struct fsp_data *pad = psmouse->private; + int val; + + if (fsp_reg_read(psmouse, FSP_REG_ONPAD_CTL, &val)) + return -EIO; + + pad->vscroll = enable; + + if (enable) + val |= (FSP_BIT_FIX_VSCR | FSP_BIT_ONPAD_ENABLE); + else + val &= ~FSP_BIT_FIX_VSCR; + + if (fsp_reg_write(psmouse, FSP_REG_ONPAD_CTL, val)) + return -EIO; + + return 0; +} + +static int fsp_onpad_hscr(struct psmouse *psmouse, bool enable) +{ + struct fsp_data *pad = psmouse->private; + int val, v2; + + if (fsp_reg_read(psmouse, FSP_REG_ONPAD_CTL, &val)) + return -EIO; + + if (fsp_reg_read(psmouse, FSP_REG_SYSCTL5, &v2)) + return -EIO; + + pad->hscroll = enable; + + if (enable) { + val |= (FSP_BIT_FIX_HSCR | FSP_BIT_ONPAD_ENABLE); + v2 |= FSP_BIT_EN_MSID6; + } else { + val &= ~FSP_BIT_FIX_HSCR; + v2 &= ~(FSP_BIT_EN_MSID6 | FSP_BIT_EN_MSID7 | FSP_BIT_EN_MSID8); + } + + if (fsp_reg_write(psmouse, FSP_REG_ONPAD_CTL, val)) + return -EIO; + + /* reconfigure horizontal scrolling packet output */ + if (fsp_reg_write(psmouse, FSP_REG_SYSCTL5, v2)) + return -EIO; + + return 0; +} + +/* + * Write device specific initial parameters. + * + * ex: 0xab 0xcd - write oxcd into register 0xab + */ +static ssize_t fsp_attr_set_setreg(struct psmouse *psmouse, void *data, + const char *buf, size_t count) +{ + unsigned long reg, val; + char *rest; + ssize_t retval; + + reg = simple_strtoul(buf, &rest, 16); + if (rest == buf || *rest != ' ' || reg > 0xff) + return -EINVAL; + + if (strict_strtoul(rest + 1, 16, &val) || val > 0xff) + return -EINVAL; + + if (fsp_reg_write_enable(psmouse, true)) + return -EIO; + + retval = fsp_reg_write(psmouse, reg, val) < 0 ? -EIO : count; + + fsp_reg_write_enable(psmouse, false); + + return count; +} + +PSMOUSE_DEFINE_WO_ATTR(setreg, S_IWUSR, NULL, fsp_attr_set_setreg); + +static ssize_t fsp_attr_show_getreg(struct psmouse *psmouse, + void *data, char *buf) +{ + struct fsp_data *pad = psmouse->private; + + return sprintf(buf, "%02x%02x\n", pad->last_reg, pad->last_val); +} + +/* + * Read a register from device. + * + * ex: 0xab -- read content from register 0xab + */ +static ssize_t fsp_attr_set_getreg(struct psmouse *psmouse, void *data, + const char *buf, size_t count) +{ + struct fsp_data *pad = psmouse->private; + unsigned long reg; + int val; + + if (strict_strtoul(buf, 16, ®) || reg > 0xff) + return -EINVAL; + + if (fsp_reg_read(psmouse, reg, &val)) + return -EIO; + + pad->last_reg = reg; + pad->last_val = val; + + return count; +} + +PSMOUSE_DEFINE_ATTR(getreg, S_IWUSR | S_IRUGO, NULL, + fsp_attr_show_getreg, fsp_attr_set_getreg); + +static ssize_t fsp_attr_show_pagereg(struct psmouse *psmouse, + void *data, char *buf) +{ + int val = 0; + + if (fsp_page_reg_read(psmouse, &val)) + return -EIO; + + return sprintf(buf, "%02x\n", val); +} + +static ssize_t fsp_attr_set_pagereg(struct psmouse *psmouse, void *data, + const char *buf, size_t count) +{ + unsigned long val; + + if (strict_strtoul(buf, 16, &val) || val > 0xff) + return -EINVAL; + + if (fsp_page_reg_write(psmouse, val)) + return -EIO; + + return count; +} + +PSMOUSE_DEFINE_ATTR(page, S_IWUSR | S_IRUGO, NULL, + fsp_attr_show_pagereg, fsp_attr_set_pagereg); + +static ssize_t fsp_attr_show_vscroll(struct psmouse *psmouse, + void *data, char *buf) +{ + struct fsp_data *pad = psmouse->private; + + return sprintf(buf, "%d\n", pad->vscroll); +} + +static ssize_t fsp_attr_set_vscroll(struct psmouse *psmouse, void *data, + const char *buf, size_t count) +{ + unsigned long val; + + if (strict_strtoul(buf, 10, &val) || val > 1) + return -EINVAL; + + fsp_onpad_vscr(psmouse, val); + + return count; +} + +PSMOUSE_DEFINE_ATTR(vscroll, S_IWUSR | S_IRUGO, NULL, + fsp_attr_show_vscroll, fsp_attr_set_vscroll); + +static ssize_t fsp_attr_show_hscroll(struct psmouse *psmouse, + void *data, char *buf) +{ + struct fsp_data *pad = psmouse->private; + + return sprintf(buf, "%d\n", pad->hscroll); +} + +static ssize_t fsp_attr_set_hscroll(struct psmouse *psmouse, void *data, + const char *buf, size_t count) +{ + unsigned long val; + + if (strict_strtoul(buf, 10, &val) || val > 1) + return -EINVAL; + + fsp_onpad_hscr(psmouse, val); + + return count; +} + +PSMOUSE_DEFINE_ATTR(hscroll, S_IWUSR | S_IRUGO, NULL, + fsp_attr_show_hscroll, fsp_attr_set_hscroll); + +static ssize_t fsp_attr_show_flags(struct psmouse *psmouse, + void *data, char *buf) +{ + struct fsp_data *pad = psmouse->private; + + return sprintf(buf, "%c\n", + pad->flags & FSPDRV_FLAG_EN_OPC ? 'C' : 'c'); +} + +static ssize_t fsp_attr_set_flags(struct psmouse *psmouse, void *data, + const char *buf, size_t count) +{ + struct fsp_data *pad = psmouse->private; + size_t i; + + for (i = 0; i < count; i++) { + switch (buf[i]) { + case 'C': + pad->flags |= FSPDRV_FLAG_EN_OPC; + break; + case 'c': + pad->flags &= ~FSPDRV_FLAG_EN_OPC; + break; + default: + return -EINVAL; + } + } + return count; +} + +PSMOUSE_DEFINE_ATTR(flags, S_IWUSR | S_IRUGO, NULL, + fsp_attr_show_flags, fsp_attr_set_flags); + +static ssize_t fsp_attr_show_ver(struct psmouse *psmouse, + void *data, char *buf) +{ + return sprintf(buf, "Sentelic FSP kernel module %s\n", fsp_drv_ver); +} + +PSMOUSE_DEFINE_RO_ATTR(ver, S_IRUGO, NULL, fsp_attr_show_ver); + +static struct attribute *fsp_attributes[] = { + &psmouse_attr_setreg.dattr.attr, + &psmouse_attr_getreg.dattr.attr, + &psmouse_attr_page.dattr.attr, + &psmouse_attr_vscroll.dattr.attr, + &psmouse_attr_hscroll.dattr.attr, + &psmouse_attr_flags.dattr.attr, + &psmouse_attr_ver.dattr.attr, + NULL +}; + +static struct attribute_group fsp_attribute_group = { + .attrs = fsp_attributes, +}; + +#ifdef FSP_DEBUG +static void fsp_packet_debug(unsigned char packet[]) +{ + static unsigned int ps2_packet_cnt; + static unsigned int ps2_last_second; + unsigned int jiffies_msec; + + ps2_packet_cnt++; + jiffies_msec = jiffies_to_msecs(jiffies); + printk(KERN_DEBUG "%08dms PS/2 packets: %02x, %02x, %02x, %02x\n", + jiffies_msec, packet[0], packet[1], packet[2], packet[3]); + + if (jiffies_msec - ps2_last_second > 1000) { + printk(KERN_DEBUG "PS/2 packets/sec = %d\n", ps2_packet_cnt); + ps2_packet_cnt = 0; + ps2_last_second = jiffies_msec; + } +} +#else +static void fsp_packet_debug(unsigned char packet[]) +{ +} +#endif + +static psmouse_ret_t fsp_process_byte(struct psmouse *psmouse) +{ + struct input_dev *dev = psmouse->dev; + struct fsp_data *ad = psmouse->private; + unsigned char *packet = psmouse->packet; + unsigned char button_status = 0, lscroll = 0, rscroll = 0; + int rel_x, rel_y; + + if (psmouse->pktcnt < 4) + return PSMOUSE_GOOD_DATA; + + /* + * Full packet accumulated, process it + */ + + switch (psmouse->packet[0] >> FSP_PKT_TYPE_SHIFT) { + case FSP_PKT_TYPE_ABS: + dev_warn(&psmouse->ps2dev.serio->dev, + "Unexpected absolute mode packet, ignored.\n"); + break; + + case FSP_PKT_TYPE_NORMAL_OPC: + /* on-pad click, filter it if necessary */ + if ((ad->flags & FSPDRV_FLAG_EN_OPC) != FSPDRV_FLAG_EN_OPC) + packet[0] &= ~BIT(0); + /* fall through */ + + case FSP_PKT_TYPE_NORMAL: + /* normal packet */ + /* special packet data translation from on-pad packets */ + if (packet[3] != 0) { + if (packet[3] & BIT(0)) + button_status |= 0x01; /* wheel down */ + if (packet[3] & BIT(1)) + button_status |= 0x0f; /* wheel up */ + if (packet[3] & BIT(2)) + button_status |= BIT(5);/* horizontal left */ + if (packet[3] & BIT(3)) + button_status |= BIT(4);/* horizontal right */ + /* push back to packet queue */ + if (button_status != 0) + packet[3] = button_status; + rscroll = (packet[3] >> 4) & 1; + lscroll = (packet[3] >> 5) & 1; + } + /* + * Processing wheel up/down and extra button events + */ + input_report_rel(dev, REL_WHEEL, + (int)(packet[3] & 8) - (int)(packet[3] & 7)); + input_report_rel(dev, REL_HWHEEL, lscroll - rscroll); + input_report_key(dev, BTN_BACK, lscroll); + input_report_key(dev, BTN_FORWARD, rscroll); + + /* + * Standard PS/2 Mouse + */ + input_report_key(dev, BTN_LEFT, packet[0] & 1); + input_report_key(dev, BTN_MIDDLE, (packet[0] >> 2) & 1); + input_report_key(dev, BTN_RIGHT, (packet[0] >> 1) & 1); + + rel_x = packet[1] ? (int)packet[1] - (int)((packet[0] << 4) & 0x100) : 0; + rel_y = packet[2] ? (int)((packet[0] << 3) & 0x100) - (int)packet[2] : 0; + + input_report_rel(dev, REL_X, rel_x); + input_report_rel(dev, REL_Y, rel_y); + break; + } + + input_sync(dev); + + fsp_packet_debug(packet); + + return PSMOUSE_FULL_PACKET; +} + +static int fsp_activate_protocol(struct psmouse *psmouse) +{ + struct fsp_data *pad = psmouse->private; + struct ps2dev *ps2dev = &psmouse->ps2dev; + unsigned char param[2]; + int val; + + /* + * Standard procedure to enter FSP Intellimouse mode + * (scrolling wheel, 4th and 5th buttons) + */ + param[0] = 200; + ps2_command(ps2dev, param, PSMOUSE_CMD_SETRATE); + param[0] = 200; + ps2_command(ps2dev, param, PSMOUSE_CMD_SETRATE); + param[0] = 80; + ps2_command(ps2dev, param, PSMOUSE_CMD_SETRATE); + + ps2_command(ps2dev, param, PSMOUSE_CMD_GETID); + if (param[0] != 0x04) { + dev_err(&psmouse->ps2dev.serio->dev, + "Unable to enable 4 bytes packet format.\n"); + return -EIO; + } + + if (fsp_reg_read(psmouse, FSP_REG_SYSCTL5, &val)) { + dev_err(&psmouse->ps2dev.serio->dev, + "Unable to read SYSCTL5 register.\n"); + return -EIO; + } + + val &= ~(FSP_BIT_EN_MSID7 | FSP_BIT_EN_MSID8 | FSP_BIT_EN_AUTO_MSID8); + /* Ensure we are not in absolute mode */ + val &= ~FSP_BIT_EN_PKT_G0; + if (pad->buttons == 0x06) { + /* Left/Middle/Right & Scroll Up/Down/Right/Left */ + val |= FSP_BIT_EN_MSID6; + } + + if (fsp_reg_write(psmouse, FSP_REG_SYSCTL5, val)) { + dev_err(&psmouse->ps2dev.serio->dev, + "Unable to set up required mode bits.\n"); + return -EIO; + } + + /* + * Enable OPC tags such that driver can tell the difference between + * on-pad and real button click + */ + if (fsp_opc_tag_enable(psmouse, true)) + dev_warn(&psmouse->ps2dev.serio->dev, + "Failed to enable OPC tag mode.\n"); + + /* Enable on-pad vertical and horizontal scrolling */ + fsp_onpad_vscr(psmouse, true); + fsp_onpad_hscr(psmouse, true); + + return 0; +} + +int fsp_detect(struct psmouse *psmouse, int set_properties) +{ + int id; + + if (fsp_reg_read(psmouse, FSP_REG_DEVICE_ID, &id)) + return -EIO; + + if (id != 0x01) + return -ENODEV; + + if (set_properties) { + psmouse->vendor = "Sentelic"; + psmouse->name = "FingerSensingPad"; + } + + return 0; +} + +static void fsp_reset(struct psmouse *psmouse) +{ + fsp_opc_tag_enable(psmouse, false); + fsp_onpad_vscr(psmouse, false); + fsp_onpad_hscr(psmouse, false); +} + +static void fsp_disconnect(struct psmouse *psmouse) +{ + sysfs_remove_group(&psmouse->ps2dev.serio->dev.kobj, + &fsp_attribute_group); + + fsp_reset(psmouse); + kfree(psmouse->private); +} + +static int fsp_reconnect(struct psmouse *psmouse) +{ + int version; + + if (fsp_detect(psmouse, 0)) + return -ENODEV; + + if (fsp_get_version(psmouse, &version)) + return -ENODEV; + + if (fsp_activate_protocol(psmouse)) + return -EIO; + + return 0; +} + +int fsp_init(struct psmouse *psmouse) +{ + struct fsp_data *priv; + int ver, rev, buttons; + int error; + + if (fsp_get_version(psmouse, &ver) || + fsp_get_revision(psmouse, &rev) || + fsp_get_buttons(psmouse, &buttons)) { + return -ENODEV; + } + + printk(KERN_INFO + "Finger Sensing Pad, hw: %d.%d.%d, sw: %s, buttons: %d\n", + ver >> 4, ver & 0x0F, rev, fsp_drv_ver, buttons & 7); + + psmouse->private = priv = kzalloc(sizeof(struct fsp_data), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->ver = ver; + priv->rev = rev; + priv->buttons = buttons; + + /* enable on-pad click by default */ + priv->flags |= FSPDRV_FLAG_EN_OPC; + + /* Set up various supported input event bits */ + __set_bit(BTN_BACK, psmouse->dev->keybit); + __set_bit(BTN_FORWARD, psmouse->dev->keybit); + __set_bit(REL_WHEEL, psmouse->dev->relbit); + __set_bit(REL_HWHEEL, psmouse->dev->relbit); + + psmouse->protocol_handler = fsp_process_byte; + psmouse->disconnect = fsp_disconnect; + psmouse->reconnect = fsp_reconnect; + psmouse->cleanup = fsp_reset; + psmouse->pktsize = 4; + + /* set default packet output based on number of buttons we found */ + error = fsp_activate_protocol(psmouse); + if (error) + goto err_out; + + error = sysfs_create_group(&psmouse->ps2dev.serio->dev.kobj, + &fsp_attribute_group); + if (error) { + dev_err(&psmouse->ps2dev.serio->dev, + "Failed to create sysfs attributes (%d)", error); + goto err_out; + } + + return 0; + + err_out: + kfree(psmouse->private); + psmouse->private = NULL; + return error; +} diff --git a/drivers/input/mouse/sentelic.h b/drivers/input/mouse/sentelic.h new file mode 100644 index 000000000000..083559c7282b --- /dev/null +++ b/drivers/input/mouse/sentelic.h @@ -0,0 +1,98 @@ +/*- + * Finger Sensing Pad PS/2 mouse driver. + * + * Copyright (C) 2005-2007 Asia Vital Components Co., Ltd. + * Copyright (C) 2005-2009 Tai-hwa Liang, Sentelic Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef __SENTELIC_H +#define __SENTELIC_H + +/* Finger-sensing Pad information registers */ +#define FSP_REG_DEVICE_ID 0x00 +#define FSP_REG_VERSION 0x01 +#define FSP_REG_REVISION 0x04 +#define FSP_REG_TMOD_STATUS1 0x0B +#define FSP_BIT_NO_ROTATION BIT(3) +#define FSP_REG_PAGE_CTRL 0x0F + +/* Finger-sensing Pad control registers */ +#define FSP_REG_SYSCTL1 0x10 +#define FSP_BIT_EN_REG_CLK BIT(5) +#define FSP_REG_OPC_QDOWN 0x31 +#define FSP_BIT_EN_OPC_TAG BIT(7) +#define FSP_REG_OPTZ_XLO 0x34 +#define FSP_REG_OPTZ_XHI 0x35 +#define FSP_REG_OPTZ_YLO 0x36 +#define FSP_REG_OPTZ_YHI 0x37 +#define FSP_REG_SYSCTL5 0x40 +#define FSP_BIT_90_DEGREE BIT(0) +#define FSP_BIT_EN_MSID6 BIT(1) +#define FSP_BIT_EN_MSID7 BIT(2) +#define FSP_BIT_EN_MSID8 BIT(3) +#define FSP_BIT_EN_AUTO_MSID8 BIT(5) +#define FSP_BIT_EN_PKT_G0 BIT(6) + +#define FSP_REG_ONPAD_CTL 0x43 +#define FSP_BIT_ONPAD_ENABLE BIT(0) +#define FSP_BIT_ONPAD_FBBB BIT(1) +#define FSP_BIT_FIX_VSCR BIT(3) +#define FSP_BIT_FIX_HSCR BIT(5) +#define FSP_BIT_DRAG_LOCK BIT(6) + +/* Finger-sensing Pad packet formating related definitions */ + +/* absolute packet type */ +#define FSP_PKT_TYPE_NORMAL (0x00) +#define FSP_PKT_TYPE_ABS (0x01) +#define FSP_PKT_TYPE_NOTIFY (0x02) +#define FSP_PKT_TYPE_NORMAL_OPC (0x03) +#define FSP_PKT_TYPE_SHIFT (6) + +#ifdef __KERNEL__ + +struct fsp_data { + unsigned char ver; /* hardware version */ + unsigned char rev; /* hardware revison */ + unsigned char buttons; /* Number of buttons */ + unsigned int flags; +#define FSPDRV_FLAG_EN_OPC (0x001) /* enable on-pad clicking */ + + bool vscroll; /* Vertical scroll zone enabled */ + bool hscroll; /* Horizontal scroll zone enabled */ + + unsigned char last_reg; /* Last register we requested read from */ + unsigned char last_val; +}; + +#ifdef CONFIG_MOUSE_PS2_SENTELIC +extern int fsp_detect(struct psmouse *psmouse, int set_properties); +extern int fsp_init(struct psmouse *psmouse); +#else +inline int fsp_detect(struct psmouse *psmouse, int set_properties) +{ + return -ENOSYS; +} +inline int fsp_init(struct psmouse *psmouse) +{ + return -ENOSYS; +} +#endif + +#endif /* __KERNEL__ */ + +#endif /* !__SENTELIC_H */ diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c index be5bbbb8ae4e..3a95b508bf27 100644 --- a/drivers/input/serio/libps2.c +++ b/drivers/input/serio/libps2.c @@ -161,7 +161,7 @@ static int ps2_adjust_timeout(struct ps2dev *ps2dev, int command, int timeout) * ps2_command() can only be called from a process context */ -int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command) +int __ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command) { int timeout; int send = (command >> 12) & 0xf; @@ -179,8 +179,6 @@ int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command) return -1; } - mutex_lock(&ps2dev->cmd_mutex); - serio_pause_rx(ps2dev->serio); ps2dev->flags = command == PS2_CMD_GETID ? PS2_FLAG_WAITID : 0; ps2dev->cmdcnt = receive; @@ -231,7 +229,18 @@ int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command) ps2dev->flags = 0; serio_continue_rx(ps2dev->serio); + return rc; +} +EXPORT_SYMBOL(__ps2_command); + +int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command) +{ + int rc; + + mutex_lock(&ps2dev->cmd_mutex); + rc = __ps2_command(ps2dev, param, command); mutex_unlock(&ps2dev->cmd_mutex); + return rc; } EXPORT_SYMBOL(ps2_command); diff --git a/include/linux/libps2.h b/include/linux/libps2.h index b94534b7e266..fcf5fbe6a50c 100644 --- a/include/linux/libps2.h +++ b/include/linux/libps2.h @@ -44,6 +44,7 @@ struct ps2dev { void ps2_init(struct ps2dev *ps2dev, struct serio *serio); int ps2_sendbyte(struct ps2dev *ps2dev, unsigned char byte, int timeout); void ps2_drain(struct ps2dev *ps2dev, int maxbytes, int timeout); +int __ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command); int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command); int ps2_handle_ack(struct ps2dev *ps2dev, unsigned char data); int ps2_handle_response(struct ps2dev *ps2dev, unsigned char data); -- cgit v1.2.3-71-gd317 From 4516fc0454e7ffe2f369e80045b23c2b32155004 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 14 Aug 2009 12:57:54 -0400 Subject: sunrpc: add routine for comparing addresses lockd needs these sort of routines, as does the NFSv4 callback code. Move lockd's routines into common code and rename them so that they can be used by others. Signed-off-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/lockd/clntlock.c | 2 +- fs/lockd/host.c | 4 ++-- fs/lockd/mon.c | 2 +- fs/lockd/svcsubs.c | 2 +- include/linux/lockd/lockd.h | 43 ---------------------------------------- include/linux/sunrpc/clnt.h | 48 +++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 53 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 1f3b0fc0d351..fc9032dc8862 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -166,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) + if (!rpc_cmp_addr(nlm_addr(block->b_host), addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 7cb076ac6b45..4600c2037b8b 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -111,7 +111,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) */ chain = &nlm_hosts[nlm_hash_address(ni->sap)]; hlist_for_each_entry(host, pos, chain, h_hash) { - if (!nlm_cmp_addr(nlm_addr(host), ni->sap)) + if (!rpc_cmp_addr(nlm_addr(host), ni->sap)) continue; /* See if we have an NSM handle for this client */ @@ -125,7 +125,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) if (host->h_server != ni->server) continue; if (ni->server && - !nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap)) + !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap)) continue; /* Move to head of hash chain. */ diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 30c933188dd7..f956651d0f65 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -209,7 +209,7 @@ static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap) struct nsm_handle *nsm; list_for_each_entry(nsm, &nsm_handles, sm_link) - if (nlm_cmp_addr(nsm_addr(nsm), sap)) + if (rpc_cmp_addr(nsm_addr(nsm), sap)) return nsm; return NULL; } diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 9e4d6aab611b..ad478da7ca63 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -417,7 +417,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb); static int nlmsvc_match_ip(void *datap, struct nlm_host *host) { - return nlm_cmp_addr(nlm_srcaddr(host), datap); + return rpc_cmp_addr(nlm_srcaddr(host), datap); } /** diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index c325b187966b..e7a251a988c0 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -338,49 +338,6 @@ static inline int nlm_privileged_requester(const struct svc_rqst *rqstp) } } -static inline int __nlm_cmp_addr4(const struct sockaddr *sap1, - const struct sockaddr *sap2) -{ - const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1; - const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2; - return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static inline int __nlm_cmp_addr6(const struct sockaddr *sap1, - const struct sockaddr *sap2) -{ - const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1; - const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2; - return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); -} -#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ -static inline int __nlm_cmp_addr6(const struct sockaddr *sap1, - const struct sockaddr *sap2) -{ - return 0; -} -#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ - -/* - * Compare two host addresses - * - * Return TRUE if the addresses are the same; otherwise FALSE. - */ -static inline int nlm_cmp_addr(const struct sockaddr *sap1, - const struct sockaddr *sap2) -{ - if (sap1->sa_family == sap2->sa_family) { - switch (sap1->sa_family) { - case AF_INET: - return __nlm_cmp_addr4(sap1, sap2); - case AF_INET6: - return __nlm_cmp_addr6(sap1, sap2); - } - } - return 0; -} - /* * Compare two NLM locks. * When the second lock is of type F_UNLCK, this acts like a wildcard. diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index ab3f6e90caa5..b17df361be82 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -22,6 +22,7 @@ #include #include #include +#include struct rpc_inode; @@ -188,5 +189,52 @@ static inline void rpc_set_port(struct sockaddr *sap, #define IPV6_SCOPE_DELIMITER '%' #define IPV6_SCOPE_ID_LEN sizeof("%nnnnnnnnnn") +static inline bool __rpc_cmp_addr4(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2; + + return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2; + return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); +} +#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ +static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + return false; +} +#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ + +/** + * rpc_cmp_addr - compare the address portion of two sockaddrs. + * @sap1: first sockaddr + * @sap2: second sockaddr + * + * Just compares the family and address portion. Ignores port, scope, etc. + * Returns true if the addrs are equal, false if they aren't. + */ +static inline bool rpc_cmp_addr(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + if (sap1->sa_family == sap2->sa_family) { + switch (sap1->sa_family) { + case AF_INET: + return __rpc_cmp_addr4(sap1, sap2); + case AF_INET6: + return __rpc_cmp_addr6(sap1, sap2); + } + } + return false; +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ -- cgit v1.2.3-71-gd317 From be3ad6b0b675fd1d6b48362ca30bdee75fbef6b4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 14 Aug 2009 12:57:55 -0400 Subject: sunrpc: add common routine for copying address portion of a sockaddr Signed-off-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/clnt.h | 50 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index b17df361be82..044f531aee70 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -198,6 +198,17 @@ static inline bool __rpc_cmp_addr4(const struct sockaddr *sap1, return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; } +static inline bool __rpc_copy_addr4(struct sockaddr *dst, + const struct sockaddr *src) +{ + const struct sockaddr_in *ssin = (struct sockaddr_in *) src; + struct sockaddr_in *dsin = (struct sockaddr_in *) dst; + + dsin->sin_family = ssin->sin_family; + dsin->sin_addr.s_addr = ssin->sin_addr.s_addr; + return true; +} + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1, const struct sockaddr *sap2) @@ -206,12 +217,29 @@ static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1, const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2; return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); } + +static inline bool __rpc_copy_addr6(struct sockaddr *dst, + const struct sockaddr *src) +{ + const struct sockaddr_in6 *ssin6 = (const struct sockaddr_in6 *) src; + struct sockaddr_in6 *dsin6 = (struct sockaddr_in6 *) dst; + + dsin6->sin6_family = ssin6->sin6_family; + ipv6_addr_copy(&dsin6->sin6_addr, &ssin6->sin6_addr); + return true; +} #else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1, const struct sockaddr *sap2) { return false; } + +static inline bool __rpc_copy_addr6(struct sockaddr *dst, + const struct sockaddr *src) +{ + return false; +} #endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ /** @@ -236,5 +264,27 @@ static inline bool rpc_cmp_addr(const struct sockaddr *sap1, return false; } +/** + * rpc_copy_addr - copy the address portion of one sockaddr to another + * @dst: destination sockaddr + * @src: source sockaddr + * + * Just copies the address portion and family. Ignores port, scope, etc. + * Caller is responsible for making certain that dst is large enough to hold + * the address in src. Returns true if address family is supported. Returns + * false otherwise. + */ +static inline bool rpc_copy_addr(struct sockaddr *dst, + const struct sockaddr *src) +{ + switch (src->sa_family) { + case AF_INET: + return __rpc_copy_addr4(dst, src); + case AF_INET6: + return __rpc_copy_addr6(dst, src); + } + return false; +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ -- cgit v1.2.3-71-gd317 From 363168b4ea8ec26aeb982ac6024a09f907ecd27e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 14 Aug 2009 12:57:56 -0400 Subject: nfsd: make nfs4_client->cl_addr a struct sockaddr_storage It's currently a __be32, which isn't big enough to hold an IPv6 address. Signed-off-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 32 +++++++++++++++++++------------- include/linux/nfsd/state.h | 2 +- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9295c4b56bce..bfc14d879ea1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -55,6 +55,7 @@ #include #include #include +#include #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -1220,13 +1221,15 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, int status; unsigned int strhashval; char dname[HEXDIR_LEN]; + char addr_str[INET6_ADDRSTRLEN]; nfs4_verifier verf = exid->verifier; - u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; + struct sockaddr *sa = svc_addr(rqstp); + rpc_ntop(sa, addr_str, sizeof(addr_str)); dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " - " ip_addr=%u flags %x, spa_how %d\n", + "ip_addr=%s flags %x, spa_how %d\n", __func__, rqstp, exid, exid->clname.len, exid->clname.data, - ip_addr, exid->flags, exid->spa_how); + addr_str, exid->flags, exid->spa_how); if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) return nfserr_inval; @@ -1315,7 +1318,7 @@ out_new: copy_verf(new, &verf); copy_cred(&new->cl_cred, &rqstp->rq_cred); - new->cl_addr = ip_addr; + rpc_copy_addr((struct sockaddr *) &new->cl_addr, sa); gen_clid(new); gen_confirm(new); add_to_unconfirmed(new, strhashval); @@ -1389,7 +1392,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_create_session *cr_ses) { - u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; + struct sockaddr *sa = svc_addr(rqstp); struct nfs4_client *conf, *unconf; struct nfsd4_clid_slot *cs_slot = NULL; int status = 0; @@ -1417,7 +1420,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, cs_slot->sl_seqid++; } else if (unconf) { if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || - (ip_addr != unconf->cl_addr)) { + !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { status = nfserr_clid_inuse; goto out; } @@ -1564,7 +1567,7 @@ __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) { - struct sockaddr_in *sin = svc_addr_in(rqstp); + struct sockaddr *sa = svc_addr(rqstp); struct xdr_netobj clname = { .len = setclid->se_namelen, .data = setclid->se_name, @@ -1596,8 +1599,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* RFC 3530 14.2.33 CASE 0: */ status = nfserr_clid_inuse; if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { - dprintk("NFSD: setclientid: string in use by client" - " at %pI4\n", &conf->cl_addr); + char addr_str[INET6_ADDRSTRLEN]; + rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str, + sizeof(addr_str)); + dprintk("NFSD: setclientid: string in use by client " + "at %s\n", addr_str); goto out; } } @@ -1659,7 +1665,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, gen_clid(new); } copy_verf(new, &clverifier); - new->cl_addr = sin->sin_addr.s_addr; + rpc_copy_addr((struct sockaddr *) &new->cl_addr, sa); new->cl_flavor = rqstp->rq_flavor; princ = svc_gss_principal(rqstp); if (princ) { @@ -1693,7 +1699,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid_confirm *setclientid_confirm) { - struct sockaddr_in *sin = svc_addr_in(rqstp); + struct sockaddr *sa = svc_addr(rqstp); struct nfs4_client *conf, *unconf; nfs4_verifier confirm = setclientid_confirm->sc_confirm; clientid_t * clid = &setclientid_confirm->sc_clientid; @@ -1712,9 +1718,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, unconf = find_unconfirmed_client(clid); status = nfserr_clid_inuse; - if (conf && conf->cl_addr != sin->sin_addr.s_addr) + if (conf && !rpc_cmp_addr((struct sockaddr *) &conf->cl_addr, sa)) goto out; - if (unconf && unconf->cl_addr != sin->sin_addr.s_addr) + if (unconf && !rpc_cmp_addr((struct sockaddr *) &unconf->cl_addr, sa)) goto out; /* diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 58bb19784e12..3510ddd4be49 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -200,7 +200,7 @@ struct nfs4_client { char cl_recdir[HEXDIR_LEN]; /* recovery dir */ nfs4_verifier cl_verifier; /* generated by client */ time_t cl_time; /* time of last lease renewal */ - __be32 cl_addr; /* client ipaddress */ + struct sockaddr_storage cl_addr; /* client ipaddress */ u32 cl_flavor; /* setclientid pseudoflavor */ char *cl_principal; /* setclientid principal name */ struct svc_cred cl_cred; /* setclientid principal */ -- cgit v1.2.3-71-gd317 From aa9a4ec7707a5391cde556f3fa1b0eb4bca3bcf6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 14 Aug 2009 12:57:57 -0400 Subject: nfsd: convert nfs4_cb_conn struct to hold address in sockaddr_storage ...rather than as a separate address and port fields. This will be necessary for implementing callbacks over IPv6. Also, convert gen_callback to use the standard rpcuaddr2sockaddr routine rather than its own private one. Signed-off-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 11 ++----- fs/nfsd/nfs4state.c | 81 ++++++---------------------------------------- include/linux/nfsd/state.h | 4 +-- 3 files changed, 13 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 3fd23f7aceca..81d1c5285dcc 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -377,7 +377,6 @@ static int max_cb_time(void) int setup_callback_client(struct nfs4_client *clp) { - struct sockaddr_in addr; struct nfs4_cb_conn *cb = &clp->cl_cb_conn; struct rpc_timeout timeparms = { .to_initval = max_cb_time(), @@ -385,8 +384,8 @@ int setup_callback_client(struct nfs4_client *clp) }; struct rpc_create_args args = { .protocol = IPPROTO_TCP, - .address = (struct sockaddr *)&addr, - .addrsize = sizeof(addr), + .address = (struct sockaddr *) &cb->cb_addr, + .addrsize = cb->cb_addrlen, .timeout = &timeparms, .program = &cb_program, .prognumber = cb->cb_prog, @@ -400,12 +399,6 @@ int setup_callback_client(struct nfs4_client *clp) if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) return -EINVAL; - /* Initialize address */ - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_port = htons(cb->cb_port); - addr.sin_addr.s_addr = htonl(cb->cb_addr); - /* Create RPC client */ client = rpc_create(&args); if (IS_ERR(client)) { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index bfc14d879ea1..96a742308cee 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -897,76 +897,6 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, return NULL; } -/* a helper function for parse_callback */ -static int -parse_octet(unsigned int *lenp, char **addrp) -{ - unsigned int len = *lenp; - char *p = *addrp; - int n = -1; - char c; - - for (;;) { - if (!len) - break; - len--; - c = *p++; - if (c == '.') - break; - if ((c < '0') || (c > '9')) { - n = -1; - break; - } - if (n < 0) - n = 0; - n = (n * 10) + (c - '0'); - if (n > 255) { - n = -1; - break; - } - } - *lenp = len; - *addrp = p; - return n; -} - -/* parse and set the setclientid ipv4 callback address */ -static int -parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigned short *cbportp) -{ - int temp = 0; - u32 cbaddr = 0; - u16 cbport = 0; - u32 addrlen = addr_len; - char *addr = addr_val; - int i, shift; - - /* ipaddress */ - shift = 24; - for(i = 4; i > 0 ; i--) { - if ((temp = parse_octet(&addrlen, &addr)) < 0) { - return 0; - } - cbaddr |= (temp << shift); - if (shift > 0) - shift -= 8; - } - *cbaddrp = cbaddr; - - /* port */ - shift = 8; - for(i = 2; i > 0 ; i--) { - if ((temp = parse_octet(&addrlen, &addr)) < 0) { - return 0; - } - cbport |= (temp << shift); - if (shift > 0) - shift -= 8; - } - *cbportp = cbport; - return 1; -} - static void gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) { @@ -976,14 +906,21 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3)) goto out_err; - if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val, - &cb->cb_addr, &cb->cb_port))) + cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val, + se->se_callback_addr_len, + (struct sockaddr *) &cb->cb_addr, + sizeof(cb->cb_addr)); + + if (!cb->cb_addrlen || cb->cb_addr.ss_family != AF_INET) goto out_err; + cb->cb_minorversion = 0; cb->cb_prog = se->se_callback_prog; cb->cb_ident = se->se_callback_ident; return; out_err: + cb->cb_addr.ss_family = AF_UNSPEC; + cb->cb_addrlen = 0; dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " "will not receive delegations\n", clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 3510ddd4be49..fb0c404c7c5c 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -81,8 +81,8 @@ struct nfs4_delegation { /* client delegation callback info */ struct nfs4_cb_conn { /* SETCLIENTID info */ - u32 cb_addr; - unsigned short cb_port; + struct sockaddr_storage cb_addr; + size_t cb_addrlen; u32 cb_prog; u32 cb_minorversion; u32 cb_ident; /* minorversion 0 only */ -- cgit v1.2.3-71-gd317 From fbf4665f41b02e757ab9d9198df65e319388e728 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 14 Aug 2009 12:57:59 -0400 Subject: nfsd: populate sin6_scope_id on callback address with scopeid from rq_addr on SETCLIENTID call When a SETCLIENTID call comes in, one of the args given is the svc_rqst. This struct contains an rq_addr field which holds the address that sent the call. If this is an IPv6 address, then we can use the sin6_scope_id field in this address to populate the sin6_scope_id field in the callback address. AFAICT, the rq_addr.sin6_scope_id is non-zero if and only if the client mounted the server's link-local address. Signed-off-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 7 +++++-- include/linux/sunrpc/clnt.h | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9ec0ca1ef4ea..d2a052480908 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -898,7 +898,7 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, } static void -gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) +gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) { struct nfs4_cb_conn *cb = &clp->cl_cb_conn; unsigned short expected_family; @@ -921,6 +921,9 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family) goto out_err; + if (cb->cb_addr.ss_family == AF_INET6) + ((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid; + cb->cb_minorversion = 0; cb->cb_prog = se->se_callback_prog; cb->cb_ident = se->se_callback_ident; @@ -1621,7 +1624,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } copy_cred(&new->cl_cred, &rqstp->rq_cred); gen_confirm(new); - gen_callback(new, setclid); + gen_callback(new, setclid, rpc_get_scope_id(sa)); add_to_unconfirmed(new, strhashval); setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; setclid->se_clientid.cl_id = new->cl_clientid.cl_id; diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 044f531aee70..3d025588e56e 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -286,5 +286,20 @@ static inline bool rpc_copy_addr(struct sockaddr *dst, return false; } +/** + * rpc_get_scope_id - return scopeid for a given sockaddr + * @sa: sockaddr to get scopeid from + * + * Returns the value of the sin6_scope_id for AF_INET6 addrs, or 0 if + * not an AF_INET6 address. + */ +static inline u32 rpc_get_scope_id(const struct sockaddr *sa) +{ + if (sa->sa_family != AF_INET6) + return 0; + + return ((struct sockaddr_in6 *) sa)->sin6_scope_id; +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ -- cgit v1.2.3-71-gd317 From da15cfdae03351c689736f8d142618592e3cebc3 Mon Sep 17 00:00:00 2001 From: john stultz Date: Wed, 19 Aug 2009 19:13:34 -0700 Subject: time: Introduce CLOCK_REALTIME_COARSE After talking with some application writers who want very fast, but not fine-grained timestamps, I decided to try to implement new clock_ids to clock_gettime(): CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE which returns the time at the last tick. This is very fast as we don't have to access any hardware (which can be very painful if you're using something like the acpi_pm clocksource), and we can even use the vdso clock_gettime() method to avoid the syscall. The only trade off is you only get low-res tick grained time resolution. This isn't a new idea, I know Ingo has a patch in the -rt tree that made the vsyscall gettimeofday() return coarse grained time when the vsyscall64 sysctrl was set to 2. However this affects all applications on a system. With this method, applications can choose the proper speed/granularity trade-off for themselves. Signed-off-by: John Stultz Cc: Andi Kleen Cc: nikolag@ca.ibm.com Cc: Darren Hart Cc: arjan@infradead.org Cc: jonathan@jonmasters.org LKML-Reference: <1250734414.6897.5.camel@localhost.localdomain> Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/vgtod.h | 1 + arch/x86/kernel/vsyscall_64.c | 1 + arch/x86/vdso/vclock_gettime.c | 39 ++++++++++++++++++++++++++++++++++++--- include/linux/time.h | 4 ++++ kernel/posix-timers.c | 35 +++++++++++++++++++++++++++++++++++ kernel/time/timekeeping.c | 21 +++++++++++++++++++++ 6 files changed, 98 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index dc27a69e5d2a..3d61e204826f 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -21,6 +21,7 @@ struct vsyscall_gtod_data { u32 shift; } clock; struct timespec wall_to_monotonic; + struct timespec wall_time_coarse; }; extern struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 25ee06a80aad..cf53a78e2dcf 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 6a40b78b46aa..ee55754cc3c5 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -86,14 +86,47 @@ notrace static noinline int do_monotonic(struct timespec *ts) return 0; } +notrace static noinline int do_realtime_coarse(struct timespec *ts) +{ + unsigned long seq; + do { + seq = read_seqbegin(>od->lock); + ts->tv_sec = gtod->wall_time_coarse.tv_sec; + ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; + } while (unlikely(read_seqretry(>od->lock, seq))); + return 0; +} + +notrace static noinline int do_monotonic_coarse(struct timespec *ts) +{ + unsigned long seq, ns, secs; + do { + seq = read_seqbegin(>od->lock); + secs = gtod->wall_time_coarse.tv_sec; + ns = gtod->wall_time_coarse.tv_nsec; + secs += gtod->wall_to_monotonic.tv_sec; + ns += gtod->wall_to_monotonic.tv_nsec; + } while (unlikely(read_seqretry(>od->lock, seq))); + vset_normalized_timespec(ts, secs, ns); + return 0; +} + notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { - if (likely(gtod->sysctl_enabled && gtod->clock.vread)) + if (likely(gtod->sysctl_enabled)) switch (clock) { case CLOCK_REALTIME: - return do_realtime(ts); + if (likely(gtod->clock.vread)) + return do_realtime(ts); + break; case CLOCK_MONOTONIC: - return do_monotonic(ts); + if (likely(gtod->clock.vread)) + return do_monotonic(ts); + break; + case CLOCK_REALTIME_COARSE: + return do_realtime_coarse(ts); + case CLOCK_MONOTONIC_COARSE: + return do_monotonic_coarse(ts); } return vdso_fallback_gettime(clock, ts); } diff --git a/include/linux/time.h b/include/linux/time.h index f505988398e6..256232f7e5e6 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -110,6 +110,8 @@ extern int timekeeping_suspended; unsigned long get_seconds(void); struct timespec current_kernel_time(void); +struct timespec __current_kernel_time(void); /* does not hold xtime_lock */ +struct timespec get_monotonic_coarse(void); #define CURRENT_TIME (current_kernel_time()) #define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 }) @@ -243,6 +245,8 @@ struct itimerval { #define CLOCK_PROCESS_CPUTIME_ID 2 #define CLOCK_THREAD_CPUTIME_ID 3 #define CLOCK_MONOTONIC_RAW 4 +#define CLOCK_REALTIME_COARSE 5 +#define CLOCK_MONOTONIC_COARSE 6 /* * The IDs of various hardware clocks: diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index d089d052c4a9..495440779ce3 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) return 0; } + +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) +{ + *tp = current_kernel_time(); + return 0; +} + +static int posix_get_monotonic_coarse(clockid_t which_clock, + struct timespec *tp) +{ + *tp = get_monotonic_coarse(); + return 0; +} + +int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +{ + *tp = ktime_to_timespec(KTIME_LOW_RES); + return 0; +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@ -262,10 +281,26 @@ static __init int init_posix_timers(void) .timer_create = no_timer_create, .nsleep = no_nsleep, }; + struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, + }; + struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, + }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 15e06defca55..03cbeb34d141 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -847,6 +847,10 @@ unsigned long get_seconds(void) } EXPORT_SYMBOL(get_seconds); +struct timespec __current_kernel_time(void) +{ + return xtime_cache; +} struct timespec current_kernel_time(void) { @@ -862,3 +866,20 @@ struct timespec current_kernel_time(void) return now; } EXPORT_SYMBOL(current_kernel_time); + +struct timespec get_monotonic_coarse(void) +{ + struct timespec now, mono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + + now = xtime_cache; + mono = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, + now.tv_nsec + mono.tv_nsec); + return now; +} -- cgit v1.2.3-71-gd317 From 5e928f77a09a07f9dd595bb8a489965d69a83458 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 18 Aug 2009 23:38:32 +0200 Subject: PM: Introduce core framework for run-time PM of I/O devices (rev. 17) Introduce a core framework for run-time power management of I/O devices. Add device run-time PM fields to 'struct dev_pm_info' and device run-time PM callbacks to 'struct dev_pm_ops'. Introduce a run-time PM workqueue and define some device run-time PM helper functions at the core level. Document all these things. Special thanks to Alan Stern for his help with the design and multiple detailed reviews of the pereceding versions of this patch and to Magnus Damm for testing feedback. Signed-off-by: Rafael J. Wysocki Acked-by: Magnus Damm --- Documentation/power/runtime_pm.txt | 378 ++++++++++++++ drivers/base/dd.c | 11 + drivers/base/power/Makefile | 1 + drivers/base/power/main.c | 22 +- drivers/base/power/power.h | 31 +- drivers/base/power/runtime.c | 1011 ++++++++++++++++++++++++++++++++++++ include/linux/pm.h | 101 +++- include/linux/pm_runtime.h | 114 ++++ kernel/power/Kconfig | 14 + kernel/power/main.c | 17 + 10 files changed, 1689 insertions(+), 11 deletions(-) create mode 100644 Documentation/power/runtime_pm.txt create mode 100644 drivers/base/power/runtime.c create mode 100644 include/linux/pm_runtime.h (limited to 'include/linux') diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt new file mode 100644 index 000000000000..f49a33b704d2 --- /dev/null +++ b/Documentation/power/runtime_pm.txt @@ -0,0 +1,378 @@ +Run-time Power Management Framework for I/O Devices + +(C) 2009 Rafael J. Wysocki , Novell Inc. + +1. Introduction + +Support for run-time power management (run-time PM) of I/O devices is provided +at the power management core (PM core) level by means of: + +* The power management workqueue pm_wq in which bus types and device drivers can + put their PM-related work items. It is strongly recommended that pm_wq be + used for queuing all work items related to run-time PM, because this allows + them to be synchronized with system-wide power transitions (suspend to RAM, + hibernation and resume from system sleep states). pm_wq is declared in + include/linux/pm_runtime.h and defined in kernel/power/main.c. + +* A number of run-time PM fields in the 'power' member of 'struct device' (which + is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can + be used for synchronizing run-time PM operations with one another. + +* Three device run-time PM callbacks in 'struct dev_pm_ops' (defined in + include/linux/pm.h). + +* A set of helper functions defined in drivers/base/power/runtime.c that can be + used for carrying out run-time PM operations in such a way that the + synchronization between them is taken care of by the PM core. Bus types and + device drivers are encouraged to use these functions. + +The run-time PM callbacks present in 'struct dev_pm_ops', the device run-time PM +fields of 'struct dev_pm_info' and the core helper functions provided for +run-time PM are described below. + +2. Device Run-time PM Callbacks + +There are three device run-time PM callbacks defined in 'struct dev_pm_ops': + +struct dev_pm_ops { + ... + int (*runtime_suspend)(struct device *dev); + int (*runtime_resume)(struct device *dev); + void (*runtime_idle)(struct device *dev); + ... +}; + +The ->runtime_suspend() callback is executed by the PM core for the bus type of +the device being suspended. The bus type's callback is then _entirely_ +_responsible_ for handling the device as appropriate, which may, but need not +include executing the device driver's own ->runtime_suspend() callback (from the +PM core's point of view it is not necessary to implement a ->runtime_suspend() +callback in a device driver as long as the bus type's ->runtime_suspend() knows +what to do to handle the device). + + * Once the bus type's ->runtime_suspend() callback has completed successfully + for given device, the PM core regards the device as suspended, which need + not mean that the device has been put into a low power state. It is + supposed to mean, however, that the device will not process data and will + not communicate with the CPU(s) and RAM until its bus type's + ->runtime_resume() callback is executed for it. The run-time PM status of + a device after successful execution of its bus type's ->runtime_suspend() + callback is 'suspended'. + + * If the bus type's ->runtime_suspend() callback returns -EBUSY or -EAGAIN, + the device's run-time PM status is supposed to be 'active', which means that + the device _must_ be fully operational afterwards. + + * If the bus type's ->runtime_suspend() callback returns an error code + different from -EBUSY or -EAGAIN, the PM core regards this as a fatal + error and will refuse to run the helper functions described in Section 4 + for the device, until the status of it is directly set either to 'active' + or to 'suspended' (the PM core provides special helper functions for this + purpose). + +In particular, if the driver requires remote wakeup capability for proper +functioning and device_may_wakeup() returns 'false' for the device, then +->runtime_suspend() should return -EBUSY. On the other hand, if +device_may_wakeup() returns 'true' for the device and the device is put +into a low power state during the execution of its bus type's +->runtime_suspend(), it is expected that remote wake-up (i.e. hardware mechanism +allowing the device to request a change of its power state, such as PCI PME) +will be enabled for the device. Generally, remote wake-up should be enabled +for all input devices put into a low power state at run time. + +The ->runtime_resume() callback is executed by the PM core for the bus type of +the device being woken up. The bus type's callback is then _entirely_ +_responsible_ for handling the device as appropriate, which may, but need not +include executing the device driver's own ->runtime_resume() callback (from the +PM core's point of view it is not necessary to implement a ->runtime_resume() +callback in a device driver as long as the bus type's ->runtime_resume() knows +what to do to handle the device). + + * Once the bus type's ->runtime_resume() callback has completed successfully, + the PM core regards the device as fully operational, which means that the + device _must_ be able to complete I/O operations as needed. The run-time + PM status of the device is then 'active'. + + * If the bus type's ->runtime_resume() callback returns an error code, the PM + core regards this as a fatal error and will refuse to run the helper + functions described in Section 4 for the device, until its status is + directly set either to 'active' or to 'suspended' (the PM core provides + special helper functions for this purpose). + +The ->runtime_idle() callback is executed by the PM core for the bus type of +given device whenever the device appears to be idle, which is indicated to the +PM core by two counters, the device's usage counter and the counter of 'active' +children of the device. + + * If any of these counters is decreased using a helper function provided by + the PM core and it turns out to be equal to zero, the other counter is + checked. If that counter also is equal to zero, the PM core executes the + device bus type's ->runtime_idle() callback (with the device as an + argument). + +The action performed by a bus type's ->runtime_idle() callback is totally +dependent on the bus type in question, but the expected and recommended action +is to check if the device can be suspended (i.e. if all of the conditions +necessary for suspending the device are satisfied) and to queue up a suspend +request for the device in that case. + +The helper functions provided by the PM core, described in Section 4, guarantee +that the following constraints are met with respect to the bus type's run-time +PM callbacks: + +(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute + ->runtime_suspend() in parallel with ->runtime_resume() or with another + instance of ->runtime_suspend() for the same device) with the exception that + ->runtime_suspend() or ->runtime_resume() can be executed in parallel with + ->runtime_idle() (although ->runtime_idle() will not be started while any + of the other callbacks is being executed for the same device). + +(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active' + devices (i.e. the PM core will only execute ->runtime_idle() or + ->runtime_suspend() for the devices the run-time PM status of which is + 'active'). + +(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device + the usage counter of which is equal to zero _and_ either the counter of + 'active' children of which is equal to zero, or the 'power.ignore_children' + flag of which is set. + +(4) ->runtime_resume() can only be executed for 'suspended' devices (i.e. the + PM core will only execute ->runtime_resume() for the devices the run-time + PM status of which is 'suspended'). + +Additionally, the helper functions provided by the PM core obey the following +rules: + + * If ->runtime_suspend() is about to be executed or there's a pending request + to execute it, ->runtime_idle() will not be executed for the same device. + + * A request to execute or to schedule the execution of ->runtime_suspend() + will cancel any pending requests to execute ->runtime_idle() for the same + device. + + * If ->runtime_resume() is about to be executed or there's a pending request + to execute it, the other callbacks will not be executed for the same device. + + * A request to execute ->runtime_resume() will cancel any pending or + scheduled requests to execute the other callbacks for the same device. + +3. Run-time PM Device Fields + +The following device run-time PM fields are present in 'struct dev_pm_info', as +defined in include/linux/pm.h: + + struct timer_list suspend_timer; + - timer used for scheduling (delayed) suspend request + + unsigned long timer_expires; + - timer expiration time, in jiffies (if this is different from zero, the + timer is running and will expire at that time, otherwise the timer is not + running) + + struct work_struct work; + - work structure used for queuing up requests (i.e. work items in pm_wq) + + wait_queue_head_t wait_queue; + - wait queue used if any of the helper functions needs to wait for another + one to complete + + spinlock_t lock; + - lock used for synchronisation + + atomic_t usage_count; + - the usage counter of the device + + atomic_t child_count; + - the count of 'active' children of the device + + unsigned int ignore_children; + - if set, the value of child_count is ignored (but still updated) + + unsigned int disable_depth; + - used for disabling the helper funcions (they work normally if this is + equal to zero); the initial value of it is 1 (i.e. run-time PM is + initially disabled for all devices) + + unsigned int runtime_error; + - if set, there was a fatal error (one of the callbacks returned error code + as described in Section 2), so the helper funtions will not work until + this flag is cleared; this is the error code returned by the failing + callback + + unsigned int idle_notification; + - if set, ->runtime_idle() is being executed + + unsigned int request_pending; + - if set, there's a pending request (i.e. a work item queued up into pm_wq) + + enum rpm_request request; + - type of request that's pending (valid if request_pending is set) + + unsigned int deferred_resume; + - set if ->runtime_resume() is about to be run while ->runtime_suspend() is + being executed for that device and it is not practical to wait for the + suspend to complete; means "start a resume as soon as you've suspended" + + enum rpm_status runtime_status; + - the run-time PM status of the device; this field's initial value is + RPM_SUSPENDED, which means that each device is initially regarded by the + PM core as 'suspended', regardless of its real hardware status + +All of the above fields are members of the 'power' member of 'struct device'. + +4. Run-time PM Device Helper Functions + +The following run-time PM helper functions are defined in +drivers/base/power/runtime.c and include/linux/pm_runtime.h: + + void pm_runtime_init(struct device *dev); + - initialize the device run-time PM fields in 'struct dev_pm_info' + + void pm_runtime_remove(struct device *dev); + - make sure that the run-time PM of the device will be disabled after + removing the device from device hierarchy + + int pm_runtime_idle(struct device *dev); + - execute ->runtime_idle() for the device's bus type; returns 0 on success + or error code on failure, where -EINPROGRESS means that ->runtime_idle() + is already being executed + + int pm_runtime_suspend(struct device *dev); + - execute ->runtime_suspend() for the device's bus type; returns 0 on + success, 1 if the device's run-time PM status was already 'suspended', or + error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt + to suspend the device again in future + + int pm_runtime_resume(struct device *dev); + - execute ->runtime_resume() for the device's bus type; returns 0 on + success, 1 if the device's run-time PM status was already 'active' or + error code on failure, where -EAGAIN means it may be safe to attempt to + resume the device again in future, but 'power.runtime_error' should be + checked additionally + + int pm_request_idle(struct device *dev); + - submit a request to execute ->runtime_idle() for the device's bus type + (the request is represented by a work item in pm_wq); returns 0 on success + or error code if the request has not been queued up + + int pm_schedule_suspend(struct device *dev, unsigned int delay); + - schedule the execution of ->runtime_suspend() for the device's bus type + in future, where 'delay' is the time to wait before queuing up a suspend + work item in pm_wq, in milliseconds (if 'delay' is zero, the work item is + queued up immediately); returns 0 on success, 1 if the device's PM + run-time status was already 'suspended', or error code if the request + hasn't been scheduled (or queued up if 'delay' is 0); if the execution of + ->runtime_suspend() is already scheduled and not yet expired, the new + value of 'delay' will be used as the time to wait + + int pm_request_resume(struct device *dev); + - submit a request to execute ->runtime_resume() for the device's bus type + (the request is represented by a work item in pm_wq); returns 0 on + success, 1 if the device's run-time PM status was already 'active', or + error code if the request hasn't been queued up + + void pm_runtime_get_noresume(struct device *dev); + - increment the device's usage counter + + int pm_runtime_get(struct device *dev); + - increment the device's usage counter, run pm_request_resume(dev) and + return its result + + int pm_runtime_get_sync(struct device *dev); + - increment the device's usage counter, run pm_runtime_resume(dev) and + return its result + + void pm_runtime_put_noidle(struct device *dev); + - decrement the device's usage counter + + int pm_runtime_put(struct device *dev); + - decrement the device's usage counter, run pm_request_idle(dev) and return + its result + + int pm_runtime_put_sync(struct device *dev); + - decrement the device's usage counter, run pm_runtime_idle(dev) and return + its result + + void pm_runtime_enable(struct device *dev); + - enable the run-time PM helper functions to run the device bus type's + run-time PM callbacks described in Section 2 + + int pm_runtime_disable(struct device *dev); + - prevent the run-time PM helper functions from running the device bus + type's run-time PM callbacks, make sure that all of the pending run-time + PM operations on the device are either completed or canceled; returns + 1 if there was a resume request pending and it was necessary to execute + ->runtime_resume() for the device's bus type to satisfy that request, + otherwise 0 is returned + + void pm_suspend_ignore_children(struct device *dev, bool enable); + - set/unset the power.ignore_children flag of the device + + int pm_runtime_set_active(struct device *dev); + - clear the device's 'power.runtime_error' flag, set the device's run-time + PM status to 'active' and update its parent's counter of 'active' + children as appropriate (it is only valid to use this function if + 'power.runtime_error' is set or 'power.disable_depth' is greater than + zero); it will fail and return error code if the device has a parent + which is not active and the 'power.ignore_children' flag of which is unset + + void pm_runtime_set_suspended(struct device *dev); + - clear the device's 'power.runtime_error' flag, set the device's run-time + PM status to 'suspended' and update its parent's counter of 'active' + children as appropriate (it is only valid to use this function if + 'power.runtime_error' is set or 'power.disable_depth' is greater than + zero) + +It is safe to execute the following helper functions from interrupt context: + +pm_request_idle() +pm_schedule_suspend() +pm_request_resume() +pm_runtime_get_noresume() +pm_runtime_get() +pm_runtime_put_noidle() +pm_runtime_put() +pm_suspend_ignore_children() +pm_runtime_set_active() +pm_runtime_set_suspended() +pm_runtime_enable() + +5. Run-time PM Initialization, Device Probing and Removal + +Initially, the run-time PM is disabled for all devices, which means that the +majority of the run-time PM helper funtions described in Section 4 will return +-EAGAIN until pm_runtime_enable() is called for the device. + +In addition to that, the initial run-time PM status of all devices is +'suspended', but it need not reflect the actual physical state of the device. +Thus, if the device is initially active (i.e. it is able to process I/O), its +run-time PM status must be changed to 'active', with the help of +pm_runtime_set_active(), before pm_runtime_enable() is called for the device. + +However, if the device has a parent and the parent's run-time PM is enabled, +calling pm_runtime_set_active() for the device will affect the parent, unless +the parent's 'power.ignore_children' flag is set. Namely, in that case the +parent won't be able to suspend at run time, using the PM core's helper +functions, as long as the child's status is 'active', even if the child's +run-time PM is still disabled (i.e. pm_runtime_enable() hasn't been called for +the child yet or pm_runtime_disable() has been called for it). For this reason, +once pm_runtime_set_active() has been called for the device, pm_runtime_enable() +should be called for it too as soon as reasonably possible or its run-time PM +status should be changed back to 'suspended' with the help of +pm_runtime_set_suspended(). + +If the default initial run-time PM status of the device (i.e. 'suspended') +reflects the actual state of the device, its bus type's or its driver's +->probe() callback will likely need to wake it up using one of the PM core's +helper functions described in Section 4. In that case, pm_runtime_resume() +should be used. Of course, for this purpose the device's run-time PM has to be +enabled earlier by calling pm_runtime_enable(). + +If the device bus type's or driver's ->probe() or ->remove() callback runs +pm_runtime_suspend() or pm_runtime_idle() or their asynchronous counterparts, +they will fail returning -EAGAIN, because the device's usage counter is +incremented by the core before executing ->probe() and ->remove(). Still, it +may be desirable to suspend the device as soon as ->probe() or ->remove() has +finished, so the PM core uses pm_runtime_idle_sync() to invoke the device bus +type's ->runtime_idle() callback at that time. diff --git a/drivers/base/dd.c b/drivers/base/dd.c index f0106875f01d..7b34b3a48f67 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "base.h" #include "power/power.h" @@ -202,7 +203,10 @@ int driver_probe_device(struct device_driver *drv, struct device *dev) pr_debug("bus: '%s': %s: matched device %s with driver %s\n", drv->bus->name, __func__, dev_name(dev), drv->name); + pm_runtime_get_noresume(dev); + pm_runtime_barrier(dev); ret = really_probe(dev, drv); + pm_runtime_put_sync(dev); return ret; } @@ -245,7 +249,9 @@ int device_attach(struct device *dev) ret = 0; } } else { + pm_runtime_get_noresume(dev); ret = bus_for_each_drv(dev->bus, NULL, dev, __device_attach); + pm_runtime_put_sync(dev); } up(&dev->sem); return ret; @@ -306,6 +312,9 @@ static void __device_release_driver(struct device *dev) drv = dev->driver; if (drv) { + pm_runtime_get_noresume(dev); + pm_runtime_barrier(dev); + driver_sysfs_remove(dev); if (dev->bus) @@ -324,6 +333,8 @@ static void __device_release_driver(struct device *dev) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_UNBOUND_DRIVER, dev); + + pm_runtime_put_sync(dev); } } diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile index 911208b89259..3ce3519e8f30 100644 --- a/drivers/base/power/Makefile +++ b/drivers/base/power/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_PM) += sysfs.o obj-$(CONFIG_PM_SLEEP) += main.o +obj-$(CONFIG_PM_RUNTIME) += runtime.o obj-$(CONFIG_PM_TRACE_RTC) += trace.o ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 1b1a786b7dec..86990011277b 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,16 @@ static DEFINE_MUTEX(dpm_list_mtx); */ static bool transition_started; +/** + * device_pm_init - Initialize the PM-related part of a device object + * @dev: Device object being initialized. + */ +void device_pm_init(struct device *dev) +{ + dev->power.status = DPM_ON; + pm_runtime_init(dev); +} + /** * device_pm_lock - lock the list of active devices used by the PM core */ @@ -105,6 +116,7 @@ void device_pm_remove(struct device *dev) mutex_lock(&dpm_list_mtx); list_del_init(&dev->power.entry); mutex_unlock(&dpm_list_mtx); + pm_runtime_remove(dev); } /** @@ -512,6 +524,7 @@ static void dpm_complete(pm_message_t state) mutex_unlock(&dpm_list_mtx); device_complete(dev, state); + pm_runtime_put_noidle(dev); mutex_lock(&dpm_list_mtx); } @@ -757,7 +770,14 @@ static int dpm_prepare(pm_message_t state) dev->power.status = DPM_PREPARING; mutex_unlock(&dpm_list_mtx); - error = device_prepare(dev, state); + pm_runtime_get_noresume(dev); + if (pm_runtime_barrier(dev) && device_may_wakeup(dev)) { + /* Wake-up requested during system sleep transition. */ + pm_runtime_put_noidle(dev); + error = -EBUSY; + } else { + error = device_prepare(dev, state); + } mutex_lock(&dpm_list_mtx); if (error) { diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index c7cb4fc3735c..b8fa1aa5225a 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -1,7 +1,14 @@ -static inline void device_pm_init(struct device *dev) -{ - dev->power.status = DPM_ON; -} +#ifdef CONFIG_PM_RUNTIME + +extern void pm_runtime_init(struct device *dev); +extern void pm_runtime_remove(struct device *dev); + +#else /* !CONFIG_PM_RUNTIME */ + +static inline void pm_runtime_init(struct device *dev) {} +static inline void pm_runtime_remove(struct device *dev) {} + +#endif /* !CONFIG_PM_RUNTIME */ #ifdef CONFIG_PM_SLEEP @@ -16,23 +23,33 @@ static inline struct device *to_device(struct list_head *entry) return container_of(entry, struct device, power.entry); } +extern void device_pm_init(struct device *dev); extern void device_pm_add(struct device *); extern void device_pm_remove(struct device *); extern void device_pm_move_before(struct device *, struct device *); extern void device_pm_move_after(struct device *, struct device *); extern void device_pm_move_last(struct device *); -#else /* CONFIG_PM_SLEEP */ +#else /* !CONFIG_PM_SLEEP */ + +static inline void device_pm_init(struct device *dev) +{ + pm_runtime_init(dev); +} + +static inline void device_pm_remove(struct device *dev) +{ + pm_runtime_remove(dev); +} static inline void device_pm_add(struct device *dev) {} -static inline void device_pm_remove(struct device *dev) {} static inline void device_pm_move_before(struct device *deva, struct device *devb) {} static inline void device_pm_move_after(struct device *deva, struct device *devb) {} static inline void device_pm_move_last(struct device *dev) {} -#endif +#endif /* !CONFIG_PM_SLEEP */ #ifdef CONFIG_PM diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c new file mode 100644 index 000000000000..38556f6cc22d --- /dev/null +++ b/drivers/base/power/runtime.c @@ -0,0 +1,1011 @@ +/* + * drivers/base/power/runtime.c - Helper functions for device run-time PM + * + * Copyright (c) 2009 Rafael J. Wysocki , Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include +#include +#include + +static int __pm_runtime_resume(struct device *dev, bool from_wq); +static int __pm_request_idle(struct device *dev); +static int __pm_request_resume(struct device *dev); + +/** + * pm_runtime_deactivate_timer - Deactivate given device's suspend timer. + * @dev: Device to handle. + */ +static void pm_runtime_deactivate_timer(struct device *dev) +{ + if (dev->power.timer_expires > 0) { + del_timer(&dev->power.suspend_timer); + dev->power.timer_expires = 0; + } +} + +/** + * pm_runtime_cancel_pending - Deactivate suspend timer and cancel requests. + * @dev: Device to handle. + */ +static void pm_runtime_cancel_pending(struct device *dev) +{ + pm_runtime_deactivate_timer(dev); + /* + * In case there's a request pending, make sure its work function will + * return without doing anything. + */ + dev->power.request = RPM_REQ_NONE; +} + +/** + * __pm_runtime_idle - Notify device bus type if the device can be suspended. + * @dev: Device to notify the bus type about. + * + * This function must be called under dev->power.lock with interrupts disabled. + */ +static int __pm_runtime_idle(struct device *dev) + __releases(&dev->power.lock) __acquires(&dev->power.lock) +{ + int retval = 0; + + dev_dbg(dev, "__pm_runtime_idle()!\n"); + + if (dev->power.runtime_error) + retval = -EINVAL; + else if (dev->power.idle_notification) + retval = -EINPROGRESS; + else if (atomic_read(&dev->power.usage_count) > 0 + || dev->power.disable_depth > 0 + || dev->power.runtime_status != RPM_ACTIVE) + retval = -EAGAIN; + else if (!pm_children_suspended(dev)) + retval = -EBUSY; + if (retval) + goto out; + + if (dev->power.request_pending) { + /* + * If an idle notification request is pending, cancel it. Any + * other pending request takes precedence over us. + */ + if (dev->power.request == RPM_REQ_IDLE) { + dev->power.request = RPM_REQ_NONE; + } else if (dev->power.request != RPM_REQ_NONE) { + retval = -EAGAIN; + goto out; + } + } + + dev->power.idle_notification = true; + + if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_idle) { + spin_unlock_irq(&dev->power.lock); + + dev->bus->pm->runtime_idle(dev); + + spin_lock_irq(&dev->power.lock); + } + + dev->power.idle_notification = false; + wake_up_all(&dev->power.wait_queue); + + out: + dev_dbg(dev, "__pm_runtime_idle() returns %d!\n", retval); + + return retval; +} + +/** + * pm_runtime_idle - Notify device bus type if the device can be suspended. + * @dev: Device to notify the bus type about. + */ +int pm_runtime_idle(struct device *dev) +{ + int retval; + + spin_lock_irq(&dev->power.lock); + retval = __pm_runtime_idle(dev); + spin_unlock_irq(&dev->power.lock); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_runtime_idle); + +/** + * __pm_runtime_suspend - Carry out run-time suspend of given device. + * @dev: Device to suspend. + * @from_wq: If set, the function has been called via pm_wq. + * + * Check if the device can be suspended and run the ->runtime_suspend() callback + * provided by its bus type. If another suspend has been started earlier, wait + * for it to finish. If an idle notification or suspend request is pending or + * scheduled, cancel it. + * + * This function must be called under dev->power.lock with interrupts disabled. + */ +int __pm_runtime_suspend(struct device *dev, bool from_wq) + __releases(&dev->power.lock) __acquires(&dev->power.lock) +{ + struct device *parent = NULL; + bool notify = false; + int retval = 0; + + dev_dbg(dev, "__pm_runtime_suspend()%s!\n", + from_wq ? " from workqueue" : ""); + + repeat: + if (dev->power.runtime_error) { + retval = -EINVAL; + goto out; + } + + /* Pending resume requests take precedence over us. */ + if (dev->power.request_pending + && dev->power.request == RPM_REQ_RESUME) { + retval = -EAGAIN; + goto out; + } + + /* Other scheduled or pending requests need to be canceled. */ + pm_runtime_cancel_pending(dev); + + if (dev->power.runtime_status == RPM_SUSPENDED) + retval = 1; + else if (dev->power.runtime_status == RPM_RESUMING + || dev->power.disable_depth > 0 + || atomic_read(&dev->power.usage_count) > 0) + retval = -EAGAIN; + else if (!pm_children_suspended(dev)) + retval = -EBUSY; + if (retval) + goto out; + + if (dev->power.runtime_status == RPM_SUSPENDING) { + DEFINE_WAIT(wait); + + if (from_wq) { + retval = -EINPROGRESS; + goto out; + } + + /* Wait for the other suspend running in parallel with us. */ + for (;;) { + prepare_to_wait(&dev->power.wait_queue, &wait, + TASK_UNINTERRUPTIBLE); + if (dev->power.runtime_status != RPM_SUSPENDING) + break; + + spin_unlock_irq(&dev->power.lock); + + schedule(); + + spin_lock_irq(&dev->power.lock); + } + finish_wait(&dev->power.wait_queue, &wait); + goto repeat; + } + + dev->power.runtime_status = RPM_SUSPENDING; + + if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_suspend) { + spin_unlock_irq(&dev->power.lock); + + retval = dev->bus->pm->runtime_suspend(dev); + + spin_lock_irq(&dev->power.lock); + dev->power.runtime_error = retval; + } else { + retval = -ENOSYS; + } + + if (retval) { + dev->power.runtime_status = RPM_ACTIVE; + pm_runtime_cancel_pending(dev); + dev->power.deferred_resume = false; + + if (retval == -EAGAIN || retval == -EBUSY) { + notify = true; + dev->power.runtime_error = 0; + } + } else { + dev->power.runtime_status = RPM_SUSPENDED; + + if (dev->parent) { + parent = dev->parent; + atomic_add_unless(&parent->power.child_count, -1, 0); + } + } + wake_up_all(&dev->power.wait_queue); + + if (dev->power.deferred_resume) { + dev->power.deferred_resume = false; + __pm_runtime_resume(dev, false); + retval = -EAGAIN; + goto out; + } + + if (notify) + __pm_runtime_idle(dev); + + if (parent && !parent->power.ignore_children) { + spin_unlock_irq(&dev->power.lock); + + pm_request_idle(parent); + + spin_lock_irq(&dev->power.lock); + } + + out: + dev_dbg(dev, "__pm_runtime_suspend() returns %d!\n", retval); + + return retval; +} + +/** + * pm_runtime_suspend - Carry out run-time suspend of given device. + * @dev: Device to suspend. + */ +int pm_runtime_suspend(struct device *dev) +{ + int retval; + + spin_lock_irq(&dev->power.lock); + retval = __pm_runtime_suspend(dev, false); + spin_unlock_irq(&dev->power.lock); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_runtime_suspend); + +/** + * __pm_runtime_resume - Carry out run-time resume of given device. + * @dev: Device to resume. + * @from_wq: If set, the function has been called via pm_wq. + * + * Check if the device can be woken up and run the ->runtime_resume() callback + * provided by its bus type. If another resume has been started earlier, wait + * for it to finish. If there's a suspend running in parallel with this + * function, wait for it to finish and resume the device. Cancel any scheduled + * or pending requests. + * + * This function must be called under dev->power.lock with interrupts disabled. + */ +int __pm_runtime_resume(struct device *dev, bool from_wq) + __releases(&dev->power.lock) __acquires(&dev->power.lock) +{ + struct device *parent = NULL; + int retval = 0; + + dev_dbg(dev, "__pm_runtime_resume()%s!\n", + from_wq ? " from workqueue" : ""); + + repeat: + if (dev->power.runtime_error) { + retval = -EINVAL; + goto out; + } + + pm_runtime_cancel_pending(dev); + + if (dev->power.runtime_status == RPM_ACTIVE) + retval = 1; + else if (dev->power.disable_depth > 0) + retval = -EAGAIN; + if (retval) + goto out; + + if (dev->power.runtime_status == RPM_RESUMING + || dev->power.runtime_status == RPM_SUSPENDING) { + DEFINE_WAIT(wait); + + if (from_wq) { + if (dev->power.runtime_status == RPM_SUSPENDING) + dev->power.deferred_resume = true; + retval = -EINPROGRESS; + goto out; + } + + /* Wait for the operation carried out in parallel with us. */ + for (;;) { + prepare_to_wait(&dev->power.wait_queue, &wait, + TASK_UNINTERRUPTIBLE); + if (dev->power.runtime_status != RPM_RESUMING + && dev->power.runtime_status != RPM_SUSPENDING) + break; + + spin_unlock_irq(&dev->power.lock); + + schedule(); + + spin_lock_irq(&dev->power.lock); + } + finish_wait(&dev->power.wait_queue, &wait); + goto repeat; + } + + if (!parent && dev->parent) { + /* + * Increment the parent's resume counter and resume it if + * necessary. + */ + parent = dev->parent; + spin_unlock_irq(&dev->power.lock); + + pm_runtime_get_noresume(parent); + + spin_lock_irq(&parent->power.lock); + /* + * We can resume if the parent's run-time PM is disabled or it + * is set to ignore children. + */ + if (!parent->power.disable_depth + && !parent->power.ignore_children) { + __pm_runtime_resume(parent, false); + if (parent->power.runtime_status != RPM_ACTIVE) + retval = -EBUSY; + } + spin_unlock_irq(&parent->power.lock); + + spin_lock_irq(&dev->power.lock); + if (retval) + goto out; + goto repeat; + } + + dev->power.runtime_status = RPM_RESUMING; + + if (dev->bus && dev->bus->pm && dev->bus->pm->runtime_resume) { + spin_unlock_irq(&dev->power.lock); + + retval = dev->bus->pm->runtime_resume(dev); + + spin_lock_irq(&dev->power.lock); + dev->power.runtime_error = retval; + } else { + retval = -ENOSYS; + } + + if (retval) { + dev->power.runtime_status = RPM_SUSPENDED; + pm_runtime_cancel_pending(dev); + } else { + dev->power.runtime_status = RPM_ACTIVE; + if (parent) + atomic_inc(&parent->power.child_count); + } + wake_up_all(&dev->power.wait_queue); + + if (!retval) + __pm_request_idle(dev); + + out: + if (parent) { + spin_unlock_irq(&dev->power.lock); + + pm_runtime_put(parent); + + spin_lock_irq(&dev->power.lock); + } + + dev_dbg(dev, "__pm_runtime_resume() returns %d!\n", retval); + + return retval; +} + +/** + * pm_runtime_resume - Carry out run-time resume of given device. + * @dev: Device to suspend. + */ +int pm_runtime_resume(struct device *dev) +{ + int retval; + + spin_lock_irq(&dev->power.lock); + retval = __pm_runtime_resume(dev, false); + spin_unlock_irq(&dev->power.lock); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_runtime_resume); + +/** + * pm_runtime_work - Universal run-time PM work function. + * @work: Work structure used for scheduling the execution of this function. + * + * Use @work to get the device object the work is to be done for, determine what + * is to be done and execute the appropriate run-time PM function. + */ +static void pm_runtime_work(struct work_struct *work) +{ + struct device *dev = container_of(work, struct device, power.work); + enum rpm_request req; + + spin_lock_irq(&dev->power.lock); + + if (!dev->power.request_pending) + goto out; + + req = dev->power.request; + dev->power.request = RPM_REQ_NONE; + dev->power.request_pending = false; + + switch (req) { + case RPM_REQ_NONE: + break; + case RPM_REQ_IDLE: + __pm_runtime_idle(dev); + break; + case RPM_REQ_SUSPEND: + __pm_runtime_suspend(dev, true); + break; + case RPM_REQ_RESUME: + __pm_runtime_resume(dev, true); + break; + } + + out: + spin_unlock_irq(&dev->power.lock); +} + +/** + * __pm_request_idle - Submit an idle notification request for given device. + * @dev: Device to handle. + * + * Check if the device's run-time PM status is correct for suspending the device + * and queue up a request to run __pm_runtime_idle() for it. + * + * This function must be called under dev->power.lock with interrupts disabled. + */ +static int __pm_request_idle(struct device *dev) +{ + int retval = 0; + + if (dev->power.runtime_error) + retval = -EINVAL; + else if (atomic_read(&dev->power.usage_count) > 0 + || dev->power.disable_depth > 0 + || dev->power.runtime_status == RPM_SUSPENDED + || dev->power.runtime_status == RPM_SUSPENDING) + retval = -EAGAIN; + else if (!pm_children_suspended(dev)) + retval = -EBUSY; + if (retval) + return retval; + + if (dev->power.request_pending) { + /* Any requests other then RPM_REQ_IDLE take precedence. */ + if (dev->power.request == RPM_REQ_NONE) + dev->power.request = RPM_REQ_IDLE; + else if (dev->power.request != RPM_REQ_IDLE) + retval = -EAGAIN; + return retval; + } + + dev->power.request = RPM_REQ_IDLE; + dev->power.request_pending = true; + queue_work(pm_wq, &dev->power.work); + + return retval; +} + +/** + * pm_request_idle - Submit an idle notification request for given device. + * @dev: Device to handle. + */ +int pm_request_idle(struct device *dev) +{ + unsigned long flags; + int retval; + + spin_lock_irqsave(&dev->power.lock, flags); + retval = __pm_request_idle(dev); + spin_unlock_irqrestore(&dev->power.lock, flags); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_request_idle); + +/** + * __pm_request_suspend - Submit a suspend request for given device. + * @dev: Device to suspend. + * + * This function must be called under dev->power.lock with interrupts disabled. + */ +static int __pm_request_suspend(struct device *dev) +{ + int retval = 0; + + if (dev->power.runtime_error) + return -EINVAL; + + if (dev->power.runtime_status == RPM_SUSPENDED) + retval = 1; + else if (atomic_read(&dev->power.usage_count) > 0 + || dev->power.disable_depth > 0) + retval = -EAGAIN; + else if (dev->power.runtime_status == RPM_SUSPENDING) + retval = -EINPROGRESS; + else if (!pm_children_suspended(dev)) + retval = -EBUSY; + if (retval < 0) + return retval; + + pm_runtime_deactivate_timer(dev); + + if (dev->power.request_pending) { + /* + * Pending resume requests take precedence over us, but we can + * overtake any other pending request. + */ + if (dev->power.request == RPM_REQ_RESUME) + retval = -EAGAIN; + else if (dev->power.request != RPM_REQ_SUSPEND) + dev->power.request = retval ? + RPM_REQ_NONE : RPM_REQ_SUSPEND; + return retval; + } else if (retval) { + return retval; + } + + dev->power.request = RPM_REQ_SUSPEND; + dev->power.request_pending = true; + queue_work(pm_wq, &dev->power.work); + + return 0; +} + +/** + * pm_suspend_timer_fn - Timer function for pm_schedule_suspend(). + * @data: Device pointer passed by pm_schedule_suspend(). + * + * Check if the time is right and execute __pm_request_suspend() in that case. + */ +static void pm_suspend_timer_fn(unsigned long data) +{ + struct device *dev = (struct device *)data; + unsigned long flags; + unsigned long expires; + + spin_lock_irqsave(&dev->power.lock, flags); + + expires = dev->power.timer_expires; + /* If 'expire' is after 'jiffies' we've been called too early. */ + if (expires > 0 && !time_after(expires, jiffies)) { + dev->power.timer_expires = 0; + __pm_request_suspend(dev); + } + + spin_unlock_irqrestore(&dev->power.lock, flags); +} + +/** + * pm_schedule_suspend - Set up a timer to submit a suspend request in future. + * @dev: Device to suspend. + * @delay: Time to wait before submitting a suspend request, in milliseconds. + */ +int pm_schedule_suspend(struct device *dev, unsigned int delay) +{ + unsigned long flags; + int retval = 0; + + spin_lock_irqsave(&dev->power.lock, flags); + + if (dev->power.runtime_error) { + retval = -EINVAL; + goto out; + } + + if (!delay) { + retval = __pm_request_suspend(dev); + goto out; + } + + pm_runtime_deactivate_timer(dev); + + if (dev->power.request_pending) { + /* + * Pending resume requests take precedence over us, but any + * other pending requests have to be canceled. + */ + if (dev->power.request == RPM_REQ_RESUME) { + retval = -EAGAIN; + goto out; + } + dev->power.request = RPM_REQ_NONE; + } + + if (dev->power.runtime_status == RPM_SUSPENDED) + retval = 1; + else if (dev->power.runtime_status == RPM_SUSPENDING) + retval = -EINPROGRESS; + else if (atomic_read(&dev->power.usage_count) > 0 + || dev->power.disable_depth > 0) + retval = -EAGAIN; + else if (!pm_children_suspended(dev)) + retval = -EBUSY; + if (retval) + goto out; + + dev->power.timer_expires = jiffies + msecs_to_jiffies(delay); + mod_timer(&dev->power.suspend_timer, dev->power.timer_expires); + + out: + spin_unlock_irqrestore(&dev->power.lock, flags); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_schedule_suspend); + +/** + * pm_request_resume - Submit a resume request for given device. + * @dev: Device to resume. + * + * This function must be called under dev->power.lock with interrupts disabled. + */ +static int __pm_request_resume(struct device *dev) +{ + int retval = 0; + + if (dev->power.runtime_error) + return -EINVAL; + + if (dev->power.runtime_status == RPM_ACTIVE) + retval = 1; + else if (dev->power.runtime_status == RPM_RESUMING) + retval = -EINPROGRESS; + else if (dev->power.disable_depth > 0) + retval = -EAGAIN; + if (retval < 0) + return retval; + + pm_runtime_deactivate_timer(dev); + + if (dev->power.request_pending) { + /* If non-resume request is pending, we can overtake it. */ + dev->power.request = retval ? RPM_REQ_NONE : RPM_REQ_RESUME; + return retval; + } else if (retval) { + return retval; + } + + dev->power.request = RPM_REQ_RESUME; + dev->power.request_pending = true; + queue_work(pm_wq, &dev->power.work); + + return retval; +} + +/** + * pm_request_resume - Submit a resume request for given device. + * @dev: Device to resume. + */ +int pm_request_resume(struct device *dev) +{ + unsigned long flags; + int retval; + + spin_lock_irqsave(&dev->power.lock, flags); + retval = __pm_request_resume(dev); + spin_unlock_irqrestore(&dev->power.lock, flags); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_request_resume); + +/** + * __pm_runtime_get - Reference count a device and wake it up, if necessary. + * @dev: Device to handle. + * @sync: If set and the device is suspended, resume it synchronously. + * + * Increment the usage count of the device and if it was zero previously, + * resume it or submit a resume request for it, depending on the value of @sync. + */ +int __pm_runtime_get(struct device *dev, bool sync) +{ + int retval = 1; + + if (atomic_add_return(1, &dev->power.usage_count) == 1) + retval = sync ? pm_runtime_resume(dev) : pm_request_resume(dev); + + return retval; +} +EXPORT_SYMBOL_GPL(__pm_runtime_get); + +/** + * __pm_runtime_put - Decrement the device's usage counter and notify its bus. + * @dev: Device to handle. + * @sync: If the device's bus type is to be notified, do that synchronously. + * + * Decrement the usage count of the device and if it reaches zero, carry out a + * synchronous idle notification or submit an idle notification request for it, + * depending on the value of @sync. + */ +int __pm_runtime_put(struct device *dev, bool sync) +{ + int retval = 0; + + if (atomic_dec_and_test(&dev->power.usage_count)) + retval = sync ? pm_runtime_idle(dev) : pm_request_idle(dev); + + return retval; +} +EXPORT_SYMBOL_GPL(__pm_runtime_put); + +/** + * __pm_runtime_set_status - Set run-time PM status of a device. + * @dev: Device to handle. + * @status: New run-time PM status of the device. + * + * If run-time PM of the device is disabled or its power.runtime_error field is + * different from zero, the status may be changed either to RPM_ACTIVE, or to + * RPM_SUSPENDED, as long as that reflects the actual state of the device. + * However, if the device has a parent and the parent is not active, and the + * parent's power.ignore_children flag is unset, the device's status cannot be + * set to RPM_ACTIVE, so -EBUSY is returned in that case. + * + * If successful, __pm_runtime_set_status() clears the power.runtime_error field + * and the device parent's counter of unsuspended children is modified to + * reflect the new status. If the new status is RPM_SUSPENDED, an idle + * notification request for the parent is submitted. + */ +int __pm_runtime_set_status(struct device *dev, unsigned int status) +{ + struct device *parent = dev->parent; + unsigned long flags; + bool notify_parent = false; + int error = 0; + + if (status != RPM_ACTIVE && status != RPM_SUSPENDED) + return -EINVAL; + + spin_lock_irqsave(&dev->power.lock, flags); + + if (!dev->power.runtime_error && !dev->power.disable_depth) { + error = -EAGAIN; + goto out; + } + + if (dev->power.runtime_status == status) + goto out_set; + + if (status == RPM_SUSPENDED) { + /* It always is possible to set the status to 'suspended'. */ + if (parent) { + atomic_add_unless(&parent->power.child_count, -1, 0); + notify_parent = !parent->power.ignore_children; + } + goto out_set; + } + + if (parent) { + spin_lock_irq(&parent->power.lock); + + /* + * It is invalid to put an active child under a parent that is + * not active, has run-time PM enabled and the + * 'power.ignore_children' flag unset. + */ + if (!parent->power.disable_depth + && !parent->power.ignore_children + && parent->power.runtime_status != RPM_ACTIVE) { + error = -EBUSY; + } else { + if (dev->power.runtime_status == RPM_SUSPENDED) + atomic_inc(&parent->power.child_count); + } + + spin_unlock_irq(&parent->power.lock); + + if (error) + goto out; + } + + out_set: + dev->power.runtime_status = status; + dev->power.runtime_error = 0; + out: + spin_unlock_irqrestore(&dev->power.lock, flags); + + if (notify_parent) + pm_request_idle(parent); + + return error; +} +EXPORT_SYMBOL_GPL(__pm_runtime_set_status); + +/** + * __pm_runtime_barrier - Cancel pending requests and wait for completions. + * @dev: Device to handle. + * + * Flush all pending requests for the device from pm_wq and wait for all + * run-time PM operations involving the device in progress to complete. + * + * Should be called under dev->power.lock with interrupts disabled. + */ +static void __pm_runtime_barrier(struct device *dev) +{ + pm_runtime_deactivate_timer(dev); + + if (dev->power.request_pending) { + dev->power.request = RPM_REQ_NONE; + spin_unlock_irq(&dev->power.lock); + + cancel_work_sync(&dev->power.work); + + spin_lock_irq(&dev->power.lock); + dev->power.request_pending = false; + } + + if (dev->power.runtime_status == RPM_SUSPENDING + || dev->power.runtime_status == RPM_RESUMING + || dev->power.idle_notification) { + DEFINE_WAIT(wait); + + /* Suspend, wake-up or idle notification in progress. */ + for (;;) { + prepare_to_wait(&dev->power.wait_queue, &wait, + TASK_UNINTERRUPTIBLE); + if (dev->power.runtime_status != RPM_SUSPENDING + && dev->power.runtime_status != RPM_RESUMING + && !dev->power.idle_notification) + break; + spin_unlock_irq(&dev->power.lock); + + schedule(); + + spin_lock_irq(&dev->power.lock); + } + finish_wait(&dev->power.wait_queue, &wait); + } +} + +/** + * pm_runtime_barrier - Flush pending requests and wait for completions. + * @dev: Device to handle. + * + * Prevent the device from being suspended by incrementing its usage counter and + * if there's a pending resume request for the device, wake the device up. + * Next, make sure that all pending requests for the device have been flushed + * from pm_wq and wait for all run-time PM operations involving the device in + * progress to complete. + * + * Return value: + * 1, if there was a resume request pending and the device had to be woken up, + * 0, otherwise + */ +int pm_runtime_barrier(struct device *dev) +{ + int retval = 0; + + pm_runtime_get_noresume(dev); + spin_lock_irq(&dev->power.lock); + + if (dev->power.request_pending + && dev->power.request == RPM_REQ_RESUME) { + __pm_runtime_resume(dev, false); + retval = 1; + } + + __pm_runtime_barrier(dev); + + spin_unlock_irq(&dev->power.lock); + pm_runtime_put_noidle(dev); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_runtime_barrier); + +/** + * __pm_runtime_disable - Disable run-time PM of a device. + * @dev: Device to handle. + * @check_resume: If set, check if there's a resume request for the device. + * + * Increment power.disable_depth for the device and if was zero previously, + * cancel all pending run-time PM requests for the device and wait for all + * operations in progress to complete. The device can be either active or + * suspended after its run-time PM has been disabled. + * + * If @check_resume is set and there's a resume request pending when + * __pm_runtime_disable() is called and power.disable_depth is zero, the + * function will wake up the device before disabling its run-time PM. + */ +void __pm_runtime_disable(struct device *dev, bool check_resume) +{ + spin_lock_irq(&dev->power.lock); + + if (dev->power.disable_depth > 0) { + dev->power.disable_depth++; + goto out; + } + + /* + * Wake up the device if there's a resume request pending, because that + * means there probably is some I/O to process and disabling run-time PM + * shouldn't prevent the device from processing the I/O. + */ + if (check_resume && dev->power.request_pending + && dev->power.request == RPM_REQ_RESUME) { + /* + * Prevent suspends and idle notifications from being carried + * out after we have woken up the device. + */ + pm_runtime_get_noresume(dev); + + __pm_runtime_resume(dev, false); + + pm_runtime_put_noidle(dev); + } + + if (!dev->power.disable_depth++) + __pm_runtime_barrier(dev); + + out: + spin_unlock_irq(&dev->power.lock); +} +EXPORT_SYMBOL_GPL(__pm_runtime_disable); + +/** + * pm_runtime_enable - Enable run-time PM of a device. + * @dev: Device to handle. + */ +void pm_runtime_enable(struct device *dev) +{ + unsigned long flags; + + spin_lock_irqsave(&dev->power.lock, flags); + + if (dev->power.disable_depth > 0) + dev->power.disable_depth--; + else + dev_warn(dev, "Unbalanced %s!\n", __func__); + + spin_unlock_irqrestore(&dev->power.lock, flags); +} +EXPORT_SYMBOL_GPL(pm_runtime_enable); + +/** + * pm_runtime_init - Initialize run-time PM fields in given device object. + * @dev: Device object to initialize. + */ +void pm_runtime_init(struct device *dev) +{ + spin_lock_init(&dev->power.lock); + + dev->power.runtime_status = RPM_SUSPENDED; + dev->power.idle_notification = false; + + dev->power.disable_depth = 1; + atomic_set(&dev->power.usage_count, 0); + + dev->power.runtime_error = 0; + + atomic_set(&dev->power.child_count, 0); + pm_suspend_ignore_children(dev, false); + + dev->power.request_pending = false; + dev->power.request = RPM_REQ_NONE; + dev->power.deferred_resume = false; + INIT_WORK(&dev->power.work, pm_runtime_work); + + dev->power.timer_expires = 0; + setup_timer(&dev->power.suspend_timer, pm_suspend_timer_fn, + (unsigned long)dev); + + init_waitqueue_head(&dev->power.wait_queue); +} + +/** + * pm_runtime_remove - Prepare for removing a device from device hierarchy. + * @dev: Device object being removed from device hierarchy. + */ +void pm_runtime_remove(struct device *dev) +{ + __pm_runtime_disable(dev, false); + + /* Change the status back to 'suspended' to match the initial status. */ + if (dev->power.runtime_status == RPM_ACTIVE) + pm_runtime_set_suspended(dev); +} diff --git a/include/linux/pm.h b/include/linux/pm.h index b3f74764a586..2b6e20df0e52 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -22,6 +22,10 @@ #define _LINUX_PM_H #include +#include +#include +#include +#include /* * Callbacks for platform drivers to implement. @@ -165,6 +169,28 @@ typedef struct pm_message { * It is allowed to unregister devices while the above callbacks are being * executed. However, it is not allowed to unregister a device from within any * of its own callbacks. + * + * There also are the following callbacks related to run-time power management + * of devices: + * + * @runtime_suspend: Prepare the device for a condition in which it won't be + * able to communicate with the CPU(s) and RAM due to power management. + * This need not mean that the device should be put into a low power state. + * For example, if the device is behind a link which is about to be turned + * off, the device may remain at full power. If the device does go to low + * power and if device_may_wakeup(dev) is true, remote wake-up (i.e., a + * hardware mechanism allowing the device to request a change of its power + * state, such as PCI PME) should be enabled for it. + * + * @runtime_resume: Put the device into the fully active state in response to a + * wake-up event generated by hardware or at the request of software. If + * necessary, put the device into the full power state and restore its + * registers, so that it is fully operational. + * + * @runtime_idle: Device appears to be inactive and it might be put into a low + * power state if all of the necessary conditions are satisfied. Check + * these conditions and handle the device as appropriate, possibly queueing + * a suspend request for it. The return value is ignored by the PM core. */ struct dev_pm_ops { @@ -182,6 +208,9 @@ struct dev_pm_ops { int (*thaw_noirq)(struct device *dev); int (*poweroff_noirq)(struct device *dev); int (*restore_noirq)(struct device *dev); + int (*runtime_suspend)(struct device *dev); + int (*runtime_resume)(struct device *dev); + int (*runtime_idle)(struct device *dev); }; /** @@ -315,14 +344,80 @@ enum dpm_state { DPM_OFF_IRQ, }; +/** + * Device run-time power management status. + * + * These status labels are used internally by the PM core to indicate the + * current status of a device with respect to the PM core operations. They do + * not reflect the actual power state of the device or its status as seen by the + * driver. + * + * RPM_ACTIVE Device is fully operational. Indicates that the device + * bus type's ->runtime_resume() callback has completed + * successfully. + * + * RPM_SUSPENDED Device bus type's ->runtime_suspend() callback has + * completed successfully. The device is regarded as + * suspended. + * + * RPM_RESUMING Device bus type's ->runtime_resume() callback is being + * executed. + * + * RPM_SUSPENDING Device bus type's ->runtime_suspend() callback is being + * executed. + */ + +enum rpm_status { + RPM_ACTIVE = 0, + RPM_RESUMING, + RPM_SUSPENDED, + RPM_SUSPENDING, +}; + +/** + * Device run-time power management request types. + * + * RPM_REQ_NONE Do nothing. + * + * RPM_REQ_IDLE Run the device bus type's ->runtime_idle() callback + * + * RPM_REQ_SUSPEND Run the device bus type's ->runtime_suspend() callback + * + * RPM_REQ_RESUME Run the device bus type's ->runtime_resume() callback + */ + +enum rpm_request { + RPM_REQ_NONE = 0, + RPM_REQ_IDLE, + RPM_REQ_SUSPEND, + RPM_REQ_RESUME, +}; + struct dev_pm_info { pm_message_t power_state; - unsigned can_wakeup:1; - unsigned should_wakeup:1; + unsigned int can_wakeup:1; + unsigned int should_wakeup:1; enum dpm_state status; /* Owned by the PM core */ -#ifdef CONFIG_PM_SLEEP +#ifdef CONFIG_PM_SLEEP struct list_head entry; #endif +#ifdef CONFIG_PM_RUNTIME + struct timer_list suspend_timer; + unsigned long timer_expires; + struct work_struct work; + wait_queue_head_t wait_queue; + spinlock_t lock; + atomic_t usage_count; + atomic_t child_count; + unsigned int disable_depth:3; + unsigned int ignore_children:1; + unsigned int idle_notification:1; + unsigned int request_pending:1; + unsigned int deferred_resume:1; + enum rpm_request request; + enum rpm_status runtime_status; + int runtime_error; +#endif }; /* diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h new file mode 100644 index 000000000000..44087044910f --- /dev/null +++ b/include/linux/pm_runtime.h @@ -0,0 +1,114 @@ +/* + * pm_runtime.h - Device run-time power management helper functions. + * + * Copyright (C) 2009 Rafael J. Wysocki + * + * This file is released under the GPLv2. + */ + +#ifndef _LINUX_PM_RUNTIME_H +#define _LINUX_PM_RUNTIME_H + +#include +#include + +#ifdef CONFIG_PM_RUNTIME + +extern struct workqueue_struct *pm_wq; + +extern int pm_runtime_idle(struct device *dev); +extern int pm_runtime_suspend(struct device *dev); +extern int pm_runtime_resume(struct device *dev); +extern int pm_request_idle(struct device *dev); +extern int pm_schedule_suspend(struct device *dev, unsigned int delay); +extern int pm_request_resume(struct device *dev); +extern int __pm_runtime_get(struct device *dev, bool sync); +extern int __pm_runtime_put(struct device *dev, bool sync); +extern int __pm_runtime_set_status(struct device *dev, unsigned int status); +extern int pm_runtime_barrier(struct device *dev); +extern void pm_runtime_enable(struct device *dev); +extern void __pm_runtime_disable(struct device *dev, bool check_resume); + +static inline bool pm_children_suspended(struct device *dev) +{ + return dev->power.ignore_children + || !atomic_read(&dev->power.child_count); +} + +static inline void pm_suspend_ignore_children(struct device *dev, bool enable) +{ + dev->power.ignore_children = enable; +} + +static inline void pm_runtime_get_noresume(struct device *dev) +{ + atomic_inc(&dev->power.usage_count); +} + +static inline void pm_runtime_put_noidle(struct device *dev) +{ + atomic_add_unless(&dev->power.usage_count, -1, 0); +} + +#else /* !CONFIG_PM_RUNTIME */ + +static inline int pm_runtime_idle(struct device *dev) { return -ENOSYS; } +static inline int pm_runtime_suspend(struct device *dev) { return -ENOSYS; } +static inline int pm_runtime_resume(struct device *dev) { return 0; } +static inline int pm_request_idle(struct device *dev) { return -ENOSYS; } +static inline int pm_schedule_suspend(struct device *dev, unsigned int delay) +{ + return -ENOSYS; +} +static inline int pm_request_resume(struct device *dev) { return 0; } +static inline int __pm_runtime_get(struct device *dev, bool sync) { return 1; } +static inline int __pm_runtime_put(struct device *dev, bool sync) { return 0; } +static inline int __pm_runtime_set_status(struct device *dev, + unsigned int status) { return 0; } +static inline int pm_runtime_barrier(struct device *dev) { return 0; } +static inline void pm_runtime_enable(struct device *dev) {} +static inline void __pm_runtime_disable(struct device *dev, bool c) {} + +static inline bool pm_children_suspended(struct device *dev) { return false; } +static inline void pm_suspend_ignore_children(struct device *dev, bool en) {} +static inline void pm_runtime_get_noresume(struct device *dev) {} +static inline void pm_runtime_put_noidle(struct device *dev) {} + +#endif /* !CONFIG_PM_RUNTIME */ + +static inline int pm_runtime_get(struct device *dev) +{ + return __pm_runtime_get(dev, false); +} + +static inline int pm_runtime_get_sync(struct device *dev) +{ + return __pm_runtime_get(dev, true); +} + +static inline int pm_runtime_put(struct device *dev) +{ + return __pm_runtime_put(dev, false); +} + +static inline int pm_runtime_put_sync(struct device *dev) +{ + return __pm_runtime_put(dev, true); +} + +static inline int pm_runtime_set_active(struct device *dev) +{ + return __pm_runtime_set_status(dev, RPM_ACTIVE); +} + +static inline void pm_runtime_set_suspended(struct device *dev) +{ + __pm_runtime_set_status(dev, RPM_SUSPENDED); +} + +static inline void pm_runtime_disable(struct device *dev) +{ + __pm_runtime_disable(dev, true); +} + +#endif diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 72067cbdb37f..91e09d3b2eb2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -208,3 +208,17 @@ config APM_EMULATION random kernel OOPSes or reboots that don't seem to be related to anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). + +config PM_RUNTIME + bool "Run-time PM core functionality" + depends on PM + ---help--- + Enable functionality allowing I/O devices to be put into energy-saving + (low power) states at run time (or autosuspended) after a specified + period of inactivity and woken up in response to a hardware-generated + wake-up event or a driver's request. + + Hardware support is generally required for this functionality to work + and the bus type drivers of the buses the devices are on are + responsible for the actual handling of the autosuspend requests and + wake-up events. diff --git a/kernel/power/main.c b/kernel/power/main.c index f710e36930cc..347d2cc88cd0 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "power.h" @@ -217,8 +218,24 @@ static struct attribute_group attr_group = { .attrs = g, }; +#ifdef CONFIG_PM_RUNTIME +struct workqueue_struct *pm_wq; + +static int __init pm_start_workqueue(void) +{ + pm_wq = create_freezeable_workqueue("pm"); + + return pm_wq ? 0 : -ENOMEM; +} +#else +static inline int pm_start_workqueue(void) { return 0; } +#endif + static int __init pm_init(void) { + int error = pm_start_workqueue(); + if (error) + return error; power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; -- cgit v1.2.3-71-gd317 From 163f52b6cf3a639df6a72c7937e0eb88b20f1ef3 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sat, 1 Aug 2009 00:39:36 +0000 Subject: [SCSI] ses: fix hotplug with multiple devices and expanders In a situation either with expanders or with multiple enclosure devices, hot add doesn't always work. This is because we try to find a single enclosure device attached to the host. Fix this by looping over all enclosure devices attached to the host and also by making the find loop recognise that the enclosure devices may be expander remote (i.e. not parented by the host). Signed-off-by: James Bottomley Signed-off-by: James Bottomley --- drivers/misc/enclosure.c | 44 ++++++++++++++++++++++++++++++++------------ drivers/scsi/ses.c | 10 ++++++---- include/linux/enclosure.h | 3 ++- 3 files changed, 40 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c index 348443bdb23b..789d12128c24 100644 --- a/drivers/misc/enclosure.c +++ b/drivers/misc/enclosure.c @@ -33,24 +33,44 @@ static DEFINE_MUTEX(container_list_lock); static struct class enclosure_class; /** - * enclosure_find - find an enclosure given a device - * @dev: the device to find for + * enclosure_find - find an enclosure given a parent device + * @dev: the parent to match against + * @start: Optional enclosure device to start from (NULL if none) * - * Looks through the list of registered enclosures to see - * if it can find a match for a device. Returns NULL if no - * enclosure is found. Obtains a reference to the enclosure class - * device which must be released with device_put(). + * Looks through the list of registered enclosures to find all those + * with @dev as a parent. Returns NULL if no enclosure is + * found. @start can be used as a starting point to obtain multiple + * enclosures per parent (should begin with NULL and then be set to + * each returned enclosure device). Obtains a reference to the + * enclosure class device which must be released with device_put(). + * If @start is not NULL, a reference must be taken on it which is + * released before returning (this allows a loop through all + * enclosures to exit with only the reference on the enclosure of + * interest held). Note that the @dev may correspond to the actual + * device housing the enclosure, in which case no iteration via @start + * is required. */ -struct enclosure_device *enclosure_find(struct device *dev) +struct enclosure_device *enclosure_find(struct device *dev, + struct enclosure_device *start) { struct enclosure_device *edev; mutex_lock(&container_list_lock); - list_for_each_entry(edev, &container_list, node) { - if (edev->edev.parent == dev) { - get_device(&edev->edev); - mutex_unlock(&container_list_lock); - return edev; + edev = list_prepare_entry(start, &container_list, node); + if (start) + put_device(&start->edev); + + list_for_each_entry_continue(edev, &container_list, node) { + struct device *parent = edev->edev.parent; + /* parent might not be immediate, so iterate up to + * the root of the tree if necessary */ + while (parent) { + if (parent == dev) { + get_device(&edev->edev); + mutex_unlock(&container_list_lock); + return edev; + } + parent = parent->parent; } } mutex_unlock(&container_list_lock); diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index 4f618f487356..e1b8c828f03a 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -413,10 +413,11 @@ static int ses_intf_add(struct device *cdev, if (!scsi_device_enclosure(sdev)) { /* not an enclosure, but might be in one */ - edev = enclosure_find(&sdev->host->shost_gendev); - if (edev) { + struct enclosure_device *prev = NULL; + + while ((edev = enclosure_find(&sdev->host->shost_gendev, prev)) != NULL) { ses_match_to_enclosure(edev, sdev); - put_device(&edev->edev); + prev = edev; } return -ENODEV; } @@ -625,7 +626,8 @@ static void ses_intf_remove(struct device *cdev, if (!scsi_device_enclosure(sdev)) return; - edev = enclosure_find(cdev->parent); + /* exact match to this enclosure */ + edev = enclosure_find(cdev->parent, NULL); if (!edev) return; diff --git a/include/linux/enclosure.h b/include/linux/enclosure.h index 4332442b1b57..d77811e9ed84 100644 --- a/include/linux/enclosure.h +++ b/include/linux/enclosure.h @@ -123,7 +123,8 @@ enclosure_component_register(struct enclosure_device *, unsigned int, int enclosure_add_device(struct enclosure_device *enclosure, int component, struct device *dev); int enclosure_remove_device(struct enclosure_device *enclosure, int component); -struct enclosure_device *enclosure_find(struct device *dev); +struct enclosure_device *enclosure_find(struct device *dev, + struct enclosure_device *start); int enclosure_for_each_device(int (*fn)(struct enclosure_device *, void *), void *data); -- cgit v1.2.3-71-gd317 From 43d8eb9cfd0aea93be32181c64e18191b69c211c Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sat, 1 Aug 2009 00:41:22 +0000 Subject: [SCSI] ses: add support for enclosure component hot removal Right at the moment, hot removal of a device within an enclosure does nothing (because the intf_remove only copes with enclosure removal not with component removal). Fix this by adding a function to remove the component. Also needed to fix the prototype of enclosure_remove_device, since we know the device we've removed but not the internal component number Signed-off-by: James Bottomley Signed-off-by: James Bottomley --- drivers/misc/enclosure.c | 22 ++++++++++++++-------- drivers/scsi/ses.c | 33 ++++++++++++++++++++++++++------- include/linux/enclosure.h | 2 +- 3 files changed, 41 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c index 789d12128c24..850706a5e553 100644 --- a/drivers/misc/enclosure.c +++ b/drivers/misc/enclosure.c @@ -332,19 +332,25 @@ EXPORT_SYMBOL_GPL(enclosure_add_device); * Returns zero on success or an error. * */ -int enclosure_remove_device(struct enclosure_device *edev, int component) +int enclosure_remove_device(struct enclosure_device *edev, struct device *dev) { struct enclosure_component *cdev; + int i; - if (!edev || component >= edev->components) + if (!edev || !dev) return -EINVAL; - cdev = &edev->component[component]; - - device_del(&cdev->cdev); - put_device(cdev->dev); - cdev->dev = NULL; - return device_add(&cdev->cdev); + for (i = 0; i < edev->components; i++) { + cdev = &edev->component[i]; + if (cdev->dev == dev) { + enclosure_remove_links(cdev); + device_del(&cdev->cdev); + put_device(dev); + cdev->dev = NULL; + return device_add(&cdev->cdev); + } + } + return -ENODEV; } EXPORT_SYMBOL_GPL(enclosure_remove_device); diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index e1b8c828f03a..be593c8525b5 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -616,18 +616,26 @@ static int ses_remove(struct device *dev) return 0; } -static void ses_intf_remove(struct device *cdev, - struct class_interface *intf) +static void ses_intf_remove_component(struct scsi_device *sdev) +{ + struct enclosure_device *edev, *prev = NULL; + + while ((edev = enclosure_find(&sdev->host->shost_gendev, prev)) != NULL) { + prev = edev; + if (!enclosure_remove_device(edev, &sdev->sdev_gendev)) + break; + } + if (edev) + put_device(&edev->edev); +} + +static void ses_intf_remove_enclosure(struct scsi_device *sdev) { - struct scsi_device *sdev = to_scsi_device(cdev->parent); struct enclosure_device *edev; struct ses_device *ses_dev; - if (!scsi_device_enclosure(sdev)) - return; - /* exact match to this enclosure */ - edev = enclosure_find(cdev->parent, NULL); + edev = enclosure_find(&sdev->sdev_gendev, NULL); if (!edev) return; @@ -645,6 +653,17 @@ static void ses_intf_remove(struct device *cdev, enclosure_unregister(edev); } +static void ses_intf_remove(struct device *cdev, + struct class_interface *intf) +{ + struct scsi_device *sdev = to_scsi_device(cdev->parent); + + if (!scsi_device_enclosure(sdev)) + ses_intf_remove_component(sdev); + else + ses_intf_remove_enclosure(sdev); +} + static struct class_interface ses_interface = { .add_dev = ses_intf_add, .remove_dev = ses_intf_remove, diff --git a/include/linux/enclosure.h b/include/linux/enclosure.h index d77811e9ed84..90d1c2184112 100644 --- a/include/linux/enclosure.h +++ b/include/linux/enclosure.h @@ -122,7 +122,7 @@ enclosure_component_register(struct enclosure_device *, unsigned int, enum enclosure_component_type, const char *); int enclosure_add_device(struct enclosure_device *enclosure, int component, struct device *dev); -int enclosure_remove_device(struct enclosure_device *enclosure, int component); +int enclosure_remove_device(struct enclosure_device *, struct device *); struct enclosure_device *enclosure_find(struct device *dev, struct enclosure_device *start); int enclosure_for_each_device(int (*fn)(struct enclosure_device *, void *), -- cgit v1.2.3-71-gd317 From 05ecd5a1f76c183cca381705b3adb7d77c9a0439 Mon Sep 17 00:00:00 2001 From: Pawel Moll Date: Mon, 24 Aug 2009 19:52:38 +0900 Subject: sh: Simplify "multi-evt" interrupt handling. This patch changes the way in which "multi-evt" interrups are handled. The intc_evt2irq_table and related intc_evt2irq() have been removed and the "redirecting" handler is installed for the coupled interrupts. Thanks to that the do_IRQ() function don't have to use another level of indirection for all the interrupts... Signed-off-by: Pawel Moll Signed-off-by: Stuart Menefy Signed-off-by: Paul Mundt --- arch/sh/kernel/irq.c | 2 +- drivers/sh/intc.c | 54 ++++++++++++++++--------------------------------- include/linux/sh_intc.h | 1 - 3 files changed, 18 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c index 278c68c60488..d1053392e287 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -114,7 +114,7 @@ asmlinkage int do_IRQ(unsigned int irq, struct pt_regs *regs) #endif irq_enter(); - irq = irq_demux(intc_evt2irq(irq)); + irq = irq_demux(evt2irq(irq)); #ifdef CONFIG_IRQSTACKS curctx = (union irq_ctx *)current_thread_info(); diff --git a/drivers/sh/intc.c b/drivers/sh/intc.c index 4b1ca9d28353..a9174ec72853 100644 --- a/drivers/sh/intc.c +++ b/drivers/sh/intc.c @@ -663,16 +663,9 @@ static unsigned int __init save_reg(struct intc_desc_int *d, return 0; } -static unsigned char *intc_evt2irq_table; - -unsigned int intc_evt2irq(unsigned int vector) +static void intc_redirect_irq(unsigned int irq, struct irq_desc *desc) { - unsigned int irq = evt2irq(vector); - - if (intc_evt2irq_table && intc_evt2irq_table[irq]) - irq = intc_evt2irq_table[irq]; - - return irq; + generic_handle_irq((unsigned int)get_irq_data(irq)); } void __init register_intc_controller(struct intc_desc *desc) @@ -745,34 +738,6 @@ void __init register_intc_controller(struct intc_desc *desc) BUG_ON(k > 256); /* _INTC_ADDR_E() and _INTC_ADDR_D() are 8 bits */ - /* keep the first vector only if same enum is used multiple times */ - for (i = 0; i < desc->nr_vectors; i++) { - struct intc_vect *vect = desc->vectors + i; - int first_irq = evt2irq(vect->vect); - - if (!vect->enum_id) - continue; - - for (k = i + 1; k < desc->nr_vectors; k++) { - struct intc_vect *vect2 = desc->vectors + k; - - if (vect->enum_id != vect2->enum_id) - continue; - - vect2->enum_id = 0; - - if (!intc_evt2irq_table) - intc_evt2irq_table = kzalloc(NR_IRQS, GFP_NOWAIT); - - if (!intc_evt2irq_table) { - pr_warning("intc: cannot allocate evt2irq!\n"); - continue; - } - - intc_evt2irq_table[evt2irq(vect2->vect)] = first_irq; - } - } - /* register the vectors one by one */ for (i = 0; i < desc->nr_vectors; i++) { struct intc_vect *vect = desc->vectors + i; @@ -789,6 +754,21 @@ void __init register_intc_controller(struct intc_desc *desc) } intc_register_irq(desc, d, vect->enum_id, irq); + + for (k = i + 1; k < desc->nr_vectors; k++) { + struct intc_vect *vect2 = desc->vectors + k; + unsigned int irq2 = evt2irq(vect2->vect); + + if (vect->enum_id != vect2->enum_id) + continue; + + vect2->enum_id = 0; + + /* redirect this interrupts to the first one */ + set_irq_chip_and_handler_name(irq2, &d->chip, + intc_redirect_irq, "redirect"); + set_irq_data(irq2, (void *)irq); + } } } diff --git a/include/linux/sh_intc.h b/include/linux/sh_intc.h index eb1423a0078d..68e212ff9dde 100644 --- a/include/linux/sh_intc.h +++ b/include/linux/sh_intc.h @@ -85,7 +85,6 @@ struct intc_desc symbol __initdata = { \ } #endif -unsigned int intc_evt2irq(unsigned int vector); void __init register_intc_controller(struct intc_desc *desc); int intc_set_priority(unsigned int irq, unsigned int prio); -- cgit v1.2.3-71-gd317 From 0396c215f301e92677d1e9a064b405e31501dc1d Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Tue, 25 Aug 2009 16:41:06 +0100 Subject: uwb: avoid radio controller reset loops If a radio controller reset attempt occurs while a probe() or remove() is in progress it fails and is retried endlessly, potentially preventing the probe() or remove() from completing. If a reset fails, sleep for a bit before retrying the reset. This allows the probe()/remove() to complete. Signed-off-by: David Vrabel --- drivers/uwb/hwa-rc.c | 3 +-- drivers/uwb/reset.c | 21 +++++++++++---------- drivers/uwb/umc-bus.c | 2 +- drivers/uwb/whc-rc.c | 3 +-- include/linux/uwb.h | 2 +- 5 files changed, 15 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/uwb/hwa-rc.c b/drivers/uwb/hwa-rc.c index 9052bcb4f528..e7eeb63fab23 100644 --- a/drivers/uwb/hwa-rc.c +++ b/drivers/uwb/hwa-rc.c @@ -887,8 +887,7 @@ static int hwarc_post_reset(struct usb_interface *iface) struct hwarc *hwarc = usb_get_intfdata(iface); struct uwb_rc *uwb_rc = hwarc->uwb_rc; - uwb_rc_post_reset(uwb_rc); - return 0; + return uwb_rc_post_reset(uwb_rc); } /** USB device ID's that we handle */ diff --git a/drivers/uwb/reset.c b/drivers/uwb/reset.c index 70f8050221ff..7f0512e43d9d 100644 --- a/drivers/uwb/reset.c +++ b/drivers/uwb/reset.c @@ -30,6 +30,7 @@ */ #include #include +#include #include "uwb-internal.h" @@ -323,13 +324,15 @@ int uwbd_msg_handle_reset(struct uwb_event *evt) dev_info(&rc->uwb_dev.dev, "resetting radio controller\n"); ret = rc->reset(rc); - if (ret) { + if (ret < 0) { dev_err(&rc->uwb_dev.dev, "failed to reset hardware: %d\n", ret); goto error; } return 0; error: - /* Nothing can be done except try the reset again. */ + /* Nothing can be done except try the reset again. Wait a bit + to avoid reset loops during probe() or remove(). */ + msleep(1000); uwb_rc_reset_all(rc); return ret; } @@ -368,22 +371,20 @@ void uwb_rc_pre_reset(struct uwb_rc *rc) } EXPORT_SYMBOL_GPL(uwb_rc_pre_reset); -void uwb_rc_post_reset(struct uwb_rc *rc) +int uwb_rc_post_reset(struct uwb_rc *rc) { int ret; ret = rc->start(rc); if (ret) - goto error; + goto out; ret = uwb_rc_mac_addr_set(rc, &rc->uwb_dev.mac_addr); if (ret) - goto error; + goto out; ret = uwb_rc_dev_addr_set(rc, &rc->uwb_dev.dev_addr); if (ret) - goto error; - return; -error: - /* Nothing can be done except try the reset again. */ - uwb_rc_reset_all(rc); + goto out; +out: + return ret; } EXPORT_SYMBOL_GPL(uwb_rc_post_reset); diff --git a/drivers/uwb/umc-bus.c b/drivers/uwb/umc-bus.c index 5ad36164c13b..cdd6c8efc9f8 100644 --- a/drivers/uwb/umc-bus.c +++ b/drivers/uwb/umc-bus.c @@ -66,7 +66,7 @@ int umc_controller_reset(struct umc_dev *umc) return -EAGAIN; ret = device_for_each_child(parent, parent, umc_bus_pre_reset_helper); if (ret >= 0) - device_for_each_child(parent, parent, umc_bus_post_reset_helper); + ret = device_for_each_child(parent, parent, umc_bus_post_reset_helper); up(&parent->sem); return ret; diff --git a/drivers/uwb/whc-rc.c b/drivers/uwb/whc-rc.c index 19a1dd129212..1d9a6f54658e 100644 --- a/drivers/uwb/whc-rc.c +++ b/drivers/uwb/whc-rc.c @@ -443,8 +443,7 @@ static int whcrc_post_reset(struct umc_dev *umc) struct whcrc *whcrc = umc_get_drvdata(umc); struct uwb_rc *uwb_rc = whcrc->uwb_rc; - uwb_rc_post_reset(uwb_rc); - return 0; + return uwb_rc_post_reset(uwb_rc); } /* PCI device ID's that we handle [so it gets loaded] */ diff --git a/include/linux/uwb.h b/include/linux/uwb.h index c02128991ff7..7fc9746f22cd 100644 --- a/include/linux/uwb.h +++ b/include/linux/uwb.h @@ -597,7 +597,7 @@ void uwb_rc_neh_grok(struct uwb_rc *, void *, size_t); void uwb_rc_neh_error(struct uwb_rc *, int); void uwb_rc_reset_all(struct uwb_rc *rc); void uwb_rc_pre_reset(struct uwb_rc *rc); -void uwb_rc_post_reset(struct uwb_rc *rc); +int uwb_rc_post_reset(struct uwb_rc *rc); /** * uwb_rsv_is_owner - is the owner of this reservation the RC? -- cgit v1.2.3-71-gd317 From 9e36fda0b359d2a6ae039c3d7e71a04502a77898 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:35 -0700 Subject: x86, pat: Add PAT reserve free to io_mapping* APIs io_mapping_* interfaces were added, mainly for graphics drivers. Make this interface go through the PAT reserve/free, instead of hardcoding WC mapping. This makes sure that there are no aliases due to unconditional WC setting. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/iomap.h | 9 ++++++--- arch/x86/mm/iomap_32.c | 27 +++++++++++++++++++++++++-- include/linux/io-mapping.h | 17 ++++++++++++----- 3 files changed, 43 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index 0e9fe1d9d971..f35eb45d6576 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -26,13 +26,16 @@ #include #include -int -is_io_mapping_possible(resource_size_t base, unsigned long size); - void * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); void iounmap_atomic(void *kvaddr, enum km_type type); +int +iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); + +void +iomap_free(resource_size_t base, unsigned long size); + #endif /* _ASM_X86_IOMAP_H */ diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index fe6f84ca121e..84e236ce76ba 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -21,7 +21,7 @@ #include #include -int is_io_mapping_possible(resource_size_t base, unsigned long size) +static int is_io_mapping_possible(resource_size_t base, unsigned long size) { #if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) /* There is no way to map greater than 1 << 32 address without PAE */ @@ -30,7 +30,30 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size) #endif return 1; } -EXPORT_SYMBOL_GPL(is_io_mapping_possible); + +int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) +{ + unsigned long flag = _PAGE_CACHE_WC; + int ret; + + if (!is_io_mapping_possible(base, size)) + return -EINVAL; + + ret = io_reserve_memtype(base, base + size, &flag); + if (ret) + return ret; + + *prot = __pgprot(__PAGE_KERNEL | flag); + return 0; +} +EXPORT_SYMBOL_GPL(iomap_create_wc); + +void +iomap_free(resource_size_t base, unsigned long size) +{ + io_free_memtype(base, base + size); +} +EXPORT_SYMBOL_GPL(iomap_free); void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) { diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 0adb0f91568c..97eb928b4924 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -49,23 +49,30 @@ static inline struct io_mapping * io_mapping_create_wc(resource_size_t base, unsigned long size) { struct io_mapping *iomap; - - if (!is_io_mapping_possible(base, size)) - return NULL; + pgprot_t prot; iomap = kmalloc(sizeof(*iomap), GFP_KERNEL); if (!iomap) - return NULL; + goto out_err; + + if (iomap_create_wc(base, size, &prot)) + goto out_free; iomap->base = base; iomap->size = size; - iomap->prot = pgprot_writecombine(__pgprot(__PAGE_KERNEL)); + iomap->prot = prot; return iomap; + +out_free: + kfree(iomap); +out_err: + return NULL; } static inline void io_mapping_free(struct io_mapping *mapping) { + iomap_free(mapping->base, mapping->size); kfree(mapping); } -- cgit v1.2.3-71-gd317 From 46cf98cdaef5471926010b5bddf84c44ec177fdd Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:37 -0700 Subject: x86, pat: Generalize the use of page flag PG_uncached Only IA64 was using PG_uncached as of now. We now intend to use this bit in x86 as well, to keep track of memory type of those addresses that have page struct for them. So, generalize the use of that bit across ia64 and x86. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/ia64/Kconfig | 4 ++++ arch/x86/Kconfig | 4 ++++ include/linux/page-flags.h | 4 ++-- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 170042b420d4..e6246119932a 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -112,6 +112,10 @@ config IA64_UNCACHED_ALLOCATOR bool select GENERIC_ALLOCATOR +config ARCH_USES_PG_UNCACHED + def_bool y + depends on IA64_UNCACHED_ALLOCATOR + config AUDIT_ARCH bool default y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c07f72205909..8e1595382196 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1414,6 +1414,10 @@ config X86_PAT If unsure, say Y. +config ARCH_USES_PG_UNCACHED + def_bool y + depends on X86_PAT + config EFI bool "EFI runtime service support" depends on ACPI diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e2e5ce543595..2b87acfc5f87 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -99,7 +99,7 @@ enum pageflags { #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT PG_mlocked, /* Page is vma mlocked */ #endif -#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR +#ifdef CONFIG_ARCH_USES_PG_UNCACHED PG_uncached, /* Page has been mapped as uncached */ #endif __NR_PAGEFLAGS, @@ -257,7 +257,7 @@ PAGEFLAG_FALSE(Mlocked) SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked) #endif -#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR +#ifdef CONFIG_ARCH_USES_PG_UNCACHED PAGEFLAG(Uncached, uncached) #else PAGEFLAG_FALSE(Uncached) -- cgit v1.2.3-71-gd317 From f726f30e32305a34a203ff975e60885aa7556c6a Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 4 Aug 2009 19:08:24 +0000 Subject: dma: Add set_dma_mask hook to struct dma_map_ops POWERPC needs this hook. SPARC could use it too. Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce Signed-off-by: Benjamin Herrenschmidt --- include/linux/dma-mapping.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index c0f6c3cd788c..91b761846061 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -58,6 +58,7 @@ struct dma_map_ops { enum dma_data_direction dir); int (*mapping_error)(struct device *dev, dma_addr_t dma_addr); int (*dma_supported)(struct device *dev, u64 mask); + int (*set_dma_mask)(struct device *dev, u64 mask); int is_phys; }; -- cgit v1.2.3-71-gd317 From 77a53fd21870c726b670c0d8179294ac1ea33468 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 25 Aug 2009 19:24:13 -0700 Subject: Input: matrix-keypad - add function to build device keymap Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/matrix_keypad.c | 15 +++------------ drivers/input/keyboard/w90p910_keypad.c | 16 ++-------------- include/linux/input/matrix_keypad.h | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/keyboard/matrix_keypad.c b/drivers/input/keyboard/matrix_keypad.c index 541b981ff075..91cfe5170265 100644 --- a/drivers/input/keyboard/matrix_keypad.c +++ b/drivers/input/keyboard/matrix_keypad.c @@ -319,7 +319,6 @@ static int __devinit matrix_keypad_probe(struct platform_device *pdev) struct input_dev *input_dev; unsigned short *keycodes; unsigned int row_shift; - int i; int err; pdata = pdev->dev.platform_data; @@ -363,18 +362,10 @@ static int __devinit matrix_keypad_probe(struct platform_device *pdev) input_dev->keycode = keycodes; input_dev->keycodesize = sizeof(*keycodes); - input_dev->keycodemax = pdata->num_row_gpios << keypad->row_shift; - - for (i = 0; i < keymap_data->keymap_size; i++) { - unsigned int key = keymap_data->keymap[i]; - unsigned int row = KEY_ROW(key); - unsigned int col = KEY_COL(key); - unsigned short code = KEY_VAL(key); + input_dev->keycodemax = pdata->num_row_gpios << row_shift; - keycodes[MATRIX_SCAN_CODE(row, col, row_shift)] = code; - __set_bit(code, input_dev->keybit); - } - __clear_bit(KEY_RESERVED, input_dev->keybit); + matrix_keypad_build_keymap(keymap_data, row_shift, + input_dev->keycode, input_dev->keybit); input_set_capability(input_dev, EV_MSC, MSC_SCAN); input_set_drvdata(input_dev, keypad); diff --git a/drivers/input/keyboard/w90p910_keypad.c b/drivers/input/keyboard/w90p910_keypad.c index b8598ae124ee..2d03dd0f9e07 100644 --- a/drivers/input/keyboard/w90p910_keypad.c +++ b/drivers/input/keyboard/w90p910_keypad.c @@ -126,7 +126,6 @@ static int __devinit w90p910_keypad_probe(struct platform_device *pdev) struct resource *res; int irq; int error; - int i; if (!pdata) { dev_err(&pdev->dev, "no platform data defined\n"); @@ -197,19 +196,8 @@ static int __devinit w90p910_keypad_probe(struct platform_device *pdev) input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP); input_set_capability(input_dev, EV_MSC, MSC_SCAN); - for (i = 0; i < keymap_data->keymap_size; i++) { - unsigned int key = keymap_data->keymap[i]; - unsigned int row = KEY_ROW(key); - unsigned int col = KEY_COL(key); - unsigned short keycode = KEY_VAL(key); - unsigned int scancode = MATRIX_SCAN_CODE(row, col, - W90P910_ROW_SHIFT); - - keypad->keymap[scancode] = keycode; - __set_bit(keycode, input_dev->keybit); - } - __clear_bit(KEY_RESERVED, input_dev->keybit); - + matrix_keypad_build_keymap(keymap_data, W90P910_ROW_SHIFT, + input_dev->keycode, input_dev->keybit); error = request_irq(keypad->irq, w90p910_keypad_irq_handler, IRQF_DISABLED, pdev->name, keypad); diff --git a/include/linux/input/matrix_keypad.h b/include/linux/input/matrix_keypad.h index 15d5903af2dd..b3cd42d50e16 100644 --- a/include/linux/input/matrix_keypad.h +++ b/include/linux/input/matrix_keypad.h @@ -63,4 +63,36 @@ struct matrix_keypad_platform_data { bool wakeup; }; +/** + * matrix_keypad_build_keymap - convert platform keymap into matrix keymap + * @keymap_data: keymap supplied by the platform code + * @row_shift: number of bits to shift row value by to advance to the next + * line in the keymap + * @keymap: expanded version of keymap that is suitable for use by + * matrix keyboad driver + * @keybit: pointer to bitmap of keys supported by input device + * + * This function converts platform keymap (encoded with KEY() macro) into + * an array of keycodes that is suitable for using in a standard matrix + * keyboard driver that uses row and col as indices. + */ +static inline void +matrix_keypad_build_keymap(const struct matrix_keymap_data *keymap_data, + unsigned int row_shift, + unsigned short *keymap, unsigned long *keybit) +{ + int i; + + for (i = 0; i < keymap_data->keymap_size; i++) { + unsigned int key = keymap_data->keymap[i]; + unsigned int row = KEY_ROW(key); + unsigned int col = KEY_COL(key); + unsigned short code = KEY_VAL(key); + + keymap[MATRIX_SCAN_CODE(row, col, row_shift)] = code; + __set_bit(code, keybit); + } + __clear_bit(KEY_RESERVED, keybit); +} + #endif /* _MATRIX_KEYPAD_H */ -- cgit v1.2.3-71-gd317 From 9d8340687c524ce61e3c9c76758c4c81303acfc0 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Tue, 25 Aug 2009 19:24:14 -0700 Subject: Input: add twl4030_keypad driver Add a driver for the keypad controller on TWL4030 family chips. These support up to an 8x8 key matrix. The TWL4030 multifunction chips are mostly used on OMAP3 (or OMAP 2430) based boards. [dtor@mail.ru: switch to matrix-keypad framework, fix changing keymap from userspace] Reviewed-by: Trilok Soni Signed-off-by: David Brownell Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/Kconfig | 11 + drivers/input/keyboard/Makefile | 1 + drivers/input/keyboard/twl4030_keypad.c | 480 ++++++++++++++++++++++++++++++++ include/linux/i2c/twl4030.h | 19 +- 4 files changed, 505 insertions(+), 6 deletions(-) create mode 100644 drivers/input/keyboard/twl4030_keypad.c (limited to 'include/linux') diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index 50e407de8a78..3525c19be428 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -330,6 +330,17 @@ config KEYBOARD_OMAP To compile this driver as a module, choose M here: the module will be called omap-keypad. +config KEYBOARD_TWL4030 + tristate "TI TWL4030/TWL5030/TPS659x0 keypad support" + depends on TWL4030_CORE + help + Say Y here if your board use the keypad controller on + TWL4030 family chips. It's safe to say enable this + even on boards that don't use the keypad controller. + + To compile this driver as a module, choose M here: the + module will be called twl4030_keypad. + config KEYBOARD_TOSA tristate "Tosa keyboard" depends on MACH_TOSA diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile index 152303029203..8a7a22b30266 100644 --- a/drivers/input/keyboard/Makefile +++ b/drivers/input/keyboard/Makefile @@ -30,5 +30,6 @@ obj-$(CONFIG_KEYBOARD_SPITZ) += spitzkbd.o obj-$(CONFIG_KEYBOARD_STOWAWAY) += stowaway.o obj-$(CONFIG_KEYBOARD_SUNKBD) += sunkbd.o obj-$(CONFIG_KEYBOARD_TOSA) += tosakbd.o +obj-$(CONFIG_KEYBOARD_TWL4030) += twl4030_keypad.o obj-$(CONFIG_KEYBOARD_XTKBD) += xtkbd.o obj-$(CONFIG_KEYBOARD_W90P910) += w90p910_keypad.o diff --git a/drivers/input/keyboard/twl4030_keypad.c b/drivers/input/keyboard/twl4030_keypad.c new file mode 100644 index 000000000000..9a2977c21696 --- /dev/null +++ b/drivers/input/keyboard/twl4030_keypad.c @@ -0,0 +1,480 @@ +/* + * twl4030_keypad.c - driver for 8x8 keypad controller in twl4030 chips + * + * Copyright (C) 2007 Texas Instruments, Inc. + * Copyright (C) 2008 Nokia Corporation + * + * Code re-written for 2430SDP by: + * Syed Mohammed Khasim + * + * Initial Code: + * Manjunatha G K + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include + + +/* + * The TWL4030 family chips include a keypad controller that supports + * up to an 8x8 switch matrix. The controller can issue system wakeup + * events, since it uses only the always-on 32KiHz oscillator, and has + * an internal state machine that decodes pressed keys, including + * multi-key combinations. + * + * This driver lets boards define what keycodes they wish to report for + * which scancodes, as part of the "struct twl4030_keypad_data" used in + * the probe() routine. + * + * See the TPS65950 documentation; that's the general availability + * version of the TWL5030 second generation part. + */ +#define TWL4030_MAX_ROWS 8 /* TWL4030 hard limit */ +#define TWL4030_MAX_COLS 8 +#define TWL4030_ROW_SHIFT 3 +#define TWL4030_KEYMAP_SIZE (TWL4030_MAX_ROWS * TWL4030_MAX_COLS) + +struct twl4030_keypad { + unsigned short keymap[TWL4030_KEYMAP_SIZE]; + u16 kp_state[TWL4030_MAX_ROWS]; + unsigned n_rows; + unsigned n_cols; + unsigned irq; + + struct device *dbg_dev; + struct input_dev *input; +}; + +/*----------------------------------------------------------------------*/ + +/* arbitrary prescaler value 0..7 */ +#define PTV_PRESCALER 4 + +/* Register Offsets */ +#define KEYP_CTRL 0x00 +#define KEYP_DEB 0x01 +#define KEYP_LONG_KEY 0x02 +#define KEYP_LK_PTV 0x03 +#define KEYP_TIMEOUT_L 0x04 +#define KEYP_TIMEOUT_H 0x05 +#define KEYP_KBC 0x06 +#define KEYP_KBR 0x07 +#define KEYP_SMS 0x08 +#define KEYP_FULL_CODE_7_0 0x09 /* row 0 column status */ +#define KEYP_FULL_CODE_15_8 0x0a /* ... row 1 ... */ +#define KEYP_FULL_CODE_23_16 0x0b +#define KEYP_FULL_CODE_31_24 0x0c +#define KEYP_FULL_CODE_39_32 0x0d +#define KEYP_FULL_CODE_47_40 0x0e +#define KEYP_FULL_CODE_55_48 0x0f +#define KEYP_FULL_CODE_63_56 0x10 +#define KEYP_ISR1 0x11 +#define KEYP_IMR1 0x12 +#define KEYP_ISR2 0x13 +#define KEYP_IMR2 0x14 +#define KEYP_SIR 0x15 +#define KEYP_EDR 0x16 /* edge triggers */ +#define KEYP_SIH_CTRL 0x17 + +/* KEYP_CTRL_REG Fields */ +#define KEYP_CTRL_SOFT_NRST BIT(0) +#define KEYP_CTRL_SOFTMODEN BIT(1) +#define KEYP_CTRL_LK_EN BIT(2) +#define KEYP_CTRL_TOE_EN BIT(3) +#define KEYP_CTRL_TOLE_EN BIT(4) +#define KEYP_CTRL_RP_EN BIT(5) +#define KEYP_CTRL_KBD_ON BIT(6) + +/* KEYP_DEB, KEYP_LONG_KEY, KEYP_TIMEOUT_x*/ +#define KEYP_PERIOD_US(t, prescale) ((t) / (31 << (prescale + 1)) - 1) + +/* KEYP_LK_PTV_REG Fields */ +#define KEYP_LK_PTV_PTV_SHIFT 5 + +/* KEYP_{IMR,ISR,SIR} Fields */ +#define KEYP_IMR1_MIS BIT(3) +#define KEYP_IMR1_TO BIT(2) +#define KEYP_IMR1_LK BIT(1) +#define KEYP_IMR1_KP BIT(0) + +/* KEYP_EDR Fields */ +#define KEYP_EDR_KP_FALLING 0x01 +#define KEYP_EDR_KP_RISING 0x02 +#define KEYP_EDR_KP_BOTH 0x03 +#define KEYP_EDR_LK_FALLING 0x04 +#define KEYP_EDR_LK_RISING 0x08 +#define KEYP_EDR_TO_FALLING 0x10 +#define KEYP_EDR_TO_RISING 0x20 +#define KEYP_EDR_MIS_FALLING 0x40 +#define KEYP_EDR_MIS_RISING 0x80 + + +/*----------------------------------------------------------------------*/ + +static int twl4030_kpread(struct twl4030_keypad *kp, + u8 *data, u32 reg, u8 num_bytes) +{ + int ret = twl4030_i2c_read(TWL4030_MODULE_KEYPAD, data, reg, num_bytes); + + if (ret < 0) + dev_warn(kp->dbg_dev, + "Couldn't read TWL4030: %X - ret %d[%x]\n", + reg, ret, ret); + + return ret; +} + +static int twl4030_kpwrite_u8(struct twl4030_keypad *kp, u8 data, u32 reg) +{ + int ret = twl4030_i2c_write_u8(TWL4030_MODULE_KEYPAD, data, reg); + + if (ret < 0) + dev_warn(kp->dbg_dev, + "Could not write TWL4030: %X - ret %d[%x]\n", + reg, ret, ret); + + return ret; +} + +static inline u16 twl4030_col_xlate(struct twl4030_keypad *kp, u8 col) +{ + /* If all bits in a row are active for all coloumns then + * we have that row line connected to gnd. Mark this + * key on as if it was on matrix position n_cols (ie + * one higher than the size of the matrix). + */ + if (col == 0xFF) + return 1 << kp->n_cols; + else + return col & ((1 << kp->n_cols) - 1); +} + +static int twl4030_read_kp_matrix_state(struct twl4030_keypad *kp, u16 *state) +{ + u8 new_state[TWL4030_MAX_ROWS]; + int row; + int ret = twl4030_kpread(kp, new_state, + KEYP_FULL_CODE_7_0, kp->n_rows); + if (ret >= 0) + for (row = 0; row < kp->n_rows; row++) + state[row] = twl4030_col_xlate(kp, new_state[row]); + + return ret; +} + +static int twl4030_is_in_ghost_state(struct twl4030_keypad *kp, u16 *key_state) +{ + int i; + u16 check = 0; + + for (i = 0; i < kp->n_rows; i++) { + u16 col = key_state[i]; + + if ((col & check) && hweight16(col) > 1) + return 1; + + check |= col; + } + + return 0; +} + +static void twl4030_kp_scan(struct twl4030_keypad *kp, bool release_all) +{ + struct input_dev *input = kp->input; + u16 new_state[TWL4030_MAX_ROWS]; + int col, row; + + if (release_all) + memset(new_state, 0, sizeof(new_state)); + else { + /* check for any changes */ + int ret = twl4030_read_kp_matrix_state(kp, new_state); + + if (ret < 0) /* panic ... */ + return; + + if (twl4030_is_in_ghost_state(kp, new_state)) + return; + } + + /* check for changes and print those */ + for (row = 0; row < kp->n_rows; row++) { + int changed = new_state[row] ^ kp->kp_state[row]; + + if (!changed) + continue; + + for (col = 0; col < kp->n_cols; col++) { + int code; + + if (!(changed & (1 << col))) + continue; + + dev_dbg(kp->dbg_dev, "key [%d:%d] %s\n", row, col, + (new_state[row] & (1 << col)) ? + "press" : "release"); + + code = MATRIX_SCAN_CODE(row, col, TWL4030_ROW_SHIFT); + input_event(input, EV_MSC, MSC_SCAN, code); + input_report_key(input, kp->keymap[code], + new_state[row] & (1 << col)); + } + kp->kp_state[row] = new_state[row]; + } + input_sync(input); +} + +/* + * Keypad interrupt handler + */ +static irqreturn_t do_kp_irq(int irq, void *_kp) +{ + struct twl4030_keypad *kp = _kp; + u8 reg; + int ret; + +#ifdef CONFIG_LOCKDEP + /* WORKAROUND for lockdep forcing IRQF_DISABLED on us, which + * we don't want and can't tolerate. Although it might be + * friendlier not to borrow this thread context... + */ + local_irq_enable(); +#endif + + /* Read & Clear TWL4030 pending interrupt */ + ret = twl4030_kpread(kp, ®, KEYP_ISR1, 1); + + /* Release all keys if I2C has gone bad or + * the KEYP has gone to idle state */ + if (ret >= 0 && (reg & KEYP_IMR1_KP)) + twl4030_kp_scan(kp, false); + else + twl4030_kp_scan(kp, true); + + return IRQ_HANDLED; +} + +static int __devinit twl4030_kp_program(struct twl4030_keypad *kp) +{ + u8 reg; + int i; + + /* Enable controller, with hardware decoding but not autorepeat */ + reg = KEYP_CTRL_SOFT_NRST | KEYP_CTRL_SOFTMODEN + | KEYP_CTRL_TOE_EN | KEYP_CTRL_KBD_ON; + if (twl4030_kpwrite_u8(kp, reg, KEYP_CTRL) < 0) + return -EIO; + + /* NOTE: we could use sih_setup() here to package keypad + * event sources as four different IRQs ... but we don't. + */ + + /* Enable TO rising and KP rising and falling edge detection */ + reg = KEYP_EDR_KP_BOTH | KEYP_EDR_TO_RISING; + if (twl4030_kpwrite_u8(kp, reg, KEYP_EDR) < 0) + return -EIO; + + /* Set PTV prescaler Field */ + reg = (PTV_PRESCALER << KEYP_LK_PTV_PTV_SHIFT); + if (twl4030_kpwrite_u8(kp, reg, KEYP_LK_PTV) < 0) + return -EIO; + + /* Set key debounce time to 20 ms */ + i = KEYP_PERIOD_US(20000, PTV_PRESCALER); + if (twl4030_kpwrite_u8(kp, i, KEYP_DEB) < 0) + return -EIO; + + /* Set timeout period to 100 ms */ + i = KEYP_PERIOD_US(200000, PTV_PRESCALER); + if (twl4030_kpwrite_u8(kp, (i & 0xFF), KEYP_TIMEOUT_L) < 0) + return -EIO; + + if (twl4030_kpwrite_u8(kp, (i >> 8), KEYP_TIMEOUT_H) < 0) + return -EIO; + + /* + * Enable Clear-on-Read; disable remembering events that fire + * after the IRQ but before our handler acks (reads) them, + */ + reg = TWL4030_SIH_CTRL_COR_MASK | TWL4030_SIH_CTRL_PENDDIS_MASK; + if (twl4030_kpwrite_u8(kp, reg, KEYP_SIH_CTRL) < 0) + return -EIO; + + /* initialize key state; irqs update it from here on */ + if (twl4030_read_kp_matrix_state(kp, kp->kp_state) < 0) + return -EIO; + + return 0; +} + +/* + * Registers keypad device with input subsystem + * and configures TWL4030 keypad registers + */ +static int __devinit twl4030_kp_probe(struct platform_device *pdev) +{ + struct twl4030_keypad_data *pdata = pdev->dev.platform_data; + const struct matrix_keymap_data *keymap_data = pdata->keymap_data; + struct twl4030_keypad *kp; + struct input_dev *input; + u8 reg; + int error; + + if (!pdata || !pdata->rows || !pdata->cols || + pdata->rows > TWL4030_MAX_ROWS || pdata->cols > TWL4030_MAX_COLS) { + dev_err(&pdev->dev, "Invalid platform_data\n"); + return -EINVAL; + } + + kp = kzalloc(sizeof(*kp), GFP_KERNEL); + input = input_allocate_device(); + if (!kp || !input) { + error = -ENOMEM; + goto err1; + } + + /* Get the debug Device */ + kp->dbg_dev = &pdev->dev; + kp->input = input; + + kp->n_rows = pdata->rows; + kp->n_cols = pdata->cols; + kp->irq = platform_get_irq(pdev, 0); + + /* setup input device */ + __set_bit(EV_KEY, input->evbit); + + /* Enable auto repeat feature of Linux input subsystem */ + if (pdata->rep) + __set_bit(EV_REP, input->evbit); + + input_set_capability(input, EV_MSC, MSC_SCAN); + + input->name = "TWL4030 Keypad"; + input->phys = "twl4030_keypad/input0"; + input->dev.parent = &pdev->dev; + + input->id.bustype = BUS_HOST; + input->id.vendor = 0x0001; + input->id.product = 0x0001; + input->id.version = 0x0003; + + input->keycode = kp->keymap; + input->keycodesize = sizeof(kp->keymap[0]); + input->keycodemax = ARRAY_SIZE(kp->keymap); + + matrix_keypad_build_keymap(keymap_data, TWL4030_ROW_SHIFT, + input->keycode, input->keybit); + + error = input_register_device(input); + if (error) { + dev_err(kp->dbg_dev, + "Unable to register twl4030 keypad device\n"); + goto err1; + } + + error = twl4030_kp_program(kp); + if (error) + goto err2; + + /* + * This ISR will always execute in kernel thread context because of + * the need to access the TWL4030 over the I2C bus. + * + * NOTE: we assume this host is wired to TWL4040 INT1, not INT2 ... + */ + error = request_irq(kp->irq, do_kp_irq, 0, pdev->name, kp); + if (error) { + dev_info(kp->dbg_dev, "request_irq failed for irq no=%d\n", + kp->irq); + goto err3; + } + + /* Enable KP and TO interrupts now. */ + reg = (u8) ~(KEYP_IMR1_KP | KEYP_IMR1_TO); + if (twl4030_kpwrite_u8(kp, reg, KEYP_IMR1)) { + error = -EIO; + goto err4; + } + + platform_set_drvdata(pdev, kp); + return 0; + +err4: + /* mask all events - we don't care about the result */ + (void) twl4030_kpwrite_u8(kp, 0xff, KEYP_IMR1); +err3: + free_irq(kp->irq, NULL); +err2: + input_unregister_device(input); + input = NULL; +err1: + input_free_device(input); + kfree(kp); + return error; +} + +static int __devexit twl4030_kp_remove(struct platform_device *pdev) +{ + struct twl4030_keypad *kp = platform_get_drvdata(pdev); + + free_irq(kp->irq, kp); + input_unregister_device(kp->input); + platform_set_drvdata(pdev, NULL); + kfree(kp); + + return 0; +} + +/* + * NOTE: twl4030 are multi-function devices connected via I2C. + * So this device is a child of an I2C parent, thus it needs to + * support unplug/replug (which most platform devices don't). + */ + +static struct platform_driver twl4030_kp_driver = { + .probe = twl4030_kp_probe, + .remove = __devexit_p(twl4030_kp_remove), + .driver = { + .name = "twl4030_keypad", + .owner = THIS_MODULE, + }, +}; + +static int __init twl4030_kp_init(void) +{ + return platform_driver_register(&twl4030_kp_driver); +} +module_init(twl4030_kp_init); + +static void __exit twl4030_kp_exit(void) +{ + platform_driver_unregister(&twl4030_kp_driver); +} +module_exit(twl4030_kp_exit); + +MODULE_AUTHOR("Texas Instruments"); +MODULE_DESCRIPTION("TWL4030 Keypad Driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:twl4030_keypad"); + diff --git a/include/linux/i2c/twl4030.h b/include/linux/i2c/twl4030.h index 0dc80ef24975..3fd21d7cb6bf 100644 --- a/include/linux/i2c/twl4030.h +++ b/include/linux/i2c/twl4030.h @@ -25,6 +25,9 @@ #ifndef __TWL4030_H_ #define __TWL4030_H_ +#include +#include + /* * Using the twl4030 core we address registers using a pair * { module id, relative register offset } @@ -302,13 +305,17 @@ struct twl4030_madc_platform_data { int irq_line; }; +/* Boards have uniqe mappings of {col, row} --> keycode. + * Column and row are 4 bits, but range only from 0..7. + * a PERSISTENT_KEY is "always on" and never reported. + */ +#define PERSISTENT_KEY(c, r) KEY((c), (r), KEY_RESERVED) + struct twl4030_keypad_data { - int rows; - int cols; - int *keymap; - int irq; - unsigned int keymapsize; - unsigned int rep:1; + const struct matrix_keymap_data *keymap_data; + unsigned rows; + unsigned cols; + bool rep; }; enum twl4030_usb_mode { -- cgit v1.2.3-71-gd317 From 468de9e54a900559b55aa939a4daeaea1915e572 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Thu, 27 Aug 2009 12:07:40 -0400 Subject: nfsd41: expand solo sequence check Compounds consisting of only a sequence operation don't need any additional caching beyond the sequence information we store in the slot entry. Fix nfsd4_is_solo_sequence to identify this case correctly. The additional check for a failed sequence in nfsd4_store_cache_entry() is redundant, since the nfsd4_is_solo_sequence call lower down catches this case. The final ce_cachethis set in nfsd4_sequence is also redundant. Signed-off-by: Andy Adamson Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 9 --------- include/linux/nfsd/xdr4.h | 2 +- 2 files changed, 1 insertion(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 5f634d24861c..b44a2cfde6f1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -991,16 +991,10 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) { struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; struct svc_rqst *rqstp = resp->rqstp; - struct nfsd4_compoundargs *args = rqstp->rq_argp; - struct nfsd4_op *op = &args->ops[resp->opcnt]; struct kvec *resv = &rqstp->rq_res.head[0]; dprintk("--> %s entry %p\n", __func__, entry); - /* Don't cache a failed OP_SEQUENCE. */ - if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status) - return; - nfsd4_release_respages(entry->ce_respages, entry->ce_resused); entry->ce_opcnt = resp->opcnt; entry->ce_status = resp->cstate.status; @@ -1490,9 +1484,6 @@ nfsd4_sequence(struct svc_rqst *rqstp, slot->sl_inuse = true; slot->sl_seqid = seq->seqid; slot->sl_cache_entry.ce_cachethis = seq->cachethis; - /* Always set the cache entry cachethis for solo sequence */ - if (nfsd4_is_solo_sequence(resp)) - slot->sl_cache_entry.ce_cachethis = 1; cstate->slot = slot; cstate->session = session; diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index 5e4beb0deb80..3f716607c86d 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -467,7 +467,7 @@ struct nfsd4_compoundres { static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) { struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; - return args->opcnt == 1; + return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE; } static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) -- cgit v1.2.3-71-gd317 From 7285dd7fd375763bfb8ab1ac9cf3f1206f503c16 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 28 Aug 2009 20:25:24 +0200 Subject: clocksource: Resolve cpu hotplug dead lock with TSC unstable Martin Schwidefsky analyzed it: To register a clocksource the clocksource_mutex is acquired and if necessary timekeeping_notify is called to install the clocksource as the timekeeper clock. timekeeping_notify uses stop_machine which needs to take cpu_add_remove_lock mutex. Starting a new cpu is done with the cpu_add_remove_lock mutex held. native_cpu_up checks the tsc of the new cpu and if the tsc is no good clocksource_change_rating is called. Which needs the clocksource_mutex and the deadlock is complete. The solution is to replace the TSC via the clocksource watchdog mechanism. Mark the TSC as unstable and schedule the watchdog work so it gets removed in the watchdog thread context. Signed-off-by: Thomas Gleixner LKML-Reference: Cc: Martin Schwidefsky Cc: John Stultz --- arch/x86/kernel/tsc.c | 8 +++++--- include/linux/clocksource.h | 1 + kernel/time/clocksource.c | 33 ++++++++++++++++++++++++++++++--- 3 files changed, 36 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 968425422c46..fc3672a303d6 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -767,12 +767,14 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; - printk("Marking TSC unstable due to %s\n", reason); + printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) - clocksource_change_rating(&clocksource_tsc, 0); - else + clocksource_mark_unstable(&clocksource_tsc); + else { + clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; clocksource_tsc.rating = 0; + } } } diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 9ea40ff26f0e..83d2fbd81b93 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -277,6 +277,7 @@ extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_resume(void); extern struct clocksource * __init __weak clocksource_default_clock(void); +extern void clocksource_mark_unstable(struct clocksource *cs); #ifdef CONFIG_GENERIC_TIME_VSYSCALL extern void update_vsyscall(struct timespec *ts, struct clocksource *c); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e0c86ad6e9fb..a0af4ffcb6e5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -149,15 +149,42 @@ static void clocksource_watchdog_work(struct work_struct *work) kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); } -static void clocksource_unstable(struct clocksource *cs, int64_t delta) +static void __clocksource_unstable(struct clocksource *cs) { - printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", - cs->name, delta); cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; schedule_work(&watchdog_work); } +static void clocksource_unstable(struct clocksource *cs, int64_t delta) +{ + printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", + cs->name, delta); + __clocksource_unstable(cs); +} + +/** + * clocksource_mark_unstable - mark clocksource unstable via watchdog + * @cs: clocksource to be marked unstable + * + * This function is called instead of clocksource_change_rating from + * cpu hotplug code to avoid a deadlock between the clocksource mutex + * and the cpu hotplug mutex. It defers the update of the clocksource + * to the watchdog thread. + */ +void clocksource_mark_unstable(struct clocksource *cs) +{ + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { + if (list_empty(&cs->wd_list)) + list_add(&cs->wd_list, &watchdog_list); + __clocksource_unstable(cs); + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + static void clocksource_watchdog(unsigned long data) { struct clocksource *cs; -- cgit v1.2.3-71-gd317 From e55a5999ffcf72dc4d43d73618957964cb87065a Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 28 Jul 2009 17:41:53 +0800 Subject: ACPI: Handle CONFIG_ACPI=n better from linux/acpi.h linux/acpi.h is the top level header for interfacing with the ACPI sub-system, so acpi_disabled should be up there instead of down in asm/acpi.h -- particularly since asm/acpi.h doesn't exist for all architectures. Same story for acpi_table_parse(), which is a top-level API to Linux/ACPI. This is necessary for building some code that used to always depend on CONFIG_ACPI=y, but will soon also need to build with CONFIG_ACPI=n. Signed-off-by: Feng Tang Signed-off-by: Len Brown --- arch/x86/include/asm/acpi.h | 1 - include/linux/acpi.h | 11 ++++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 20d1465a2ab0..4518dc500903 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -144,7 +144,6 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) #else /* !CONFIG_ACPI */ -#define acpi_disabled 1 #define acpi_lapic 0 #define acpi_ioapic 0 static inline void acpi_noirq_set(void) { } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 34321cfffeab..3fce811bf9ac 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -292,7 +292,10 @@ void __init acpi_s4_no_nvs(void); extern acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 flags); extern void acpi_early_init(void); -#else /* CONFIG_ACPI */ +#else /* !CONFIG_ACPI */ + +#define acpi_disabled 1 + static inline void acpi_early_init(void) { } static inline int early_acpi_boot_init(void) @@ -331,5 +334,11 @@ static inline int acpi_check_mem_region(resource_size_t start, return 0; } +struct acpi_table_header; +static inline int acpi_table_parse(char *id, + int (*handler)(struct acpi_table_header *)) +{ + return -1; +} #endif /* !CONFIG_ACPI */ #endif /*_LINUX_ACPI_H*/ -- cgit v1.2.3-71-gd317 From 117a9ac777f8034d4675b821172d2ff71f6ec47a Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 14 Aug 2009 15:10:24 -0400 Subject: SFI: create linux/sfi.h include/linux/include/sfi.h defines everything that customers of SFI need to know in order to use the SFI suport in the kernel. The primary API is sfi_table_parse(), where a driver or another part of the kernel can supply a handler to parse the named table. sfi.h also includes the currently defined table signatures and table formats. Signed-off-by: Feng Tang Signed-off-by: Len Brown --- include/linux/sfi.h | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 include/linux/sfi.h (limited to 'include/linux') diff --git a/include/linux/sfi.h b/include/linux/sfi.h new file mode 100644 index 000000000000..9a6f7607174e --- /dev/null +++ b/include/linux/sfi.h @@ -0,0 +1,206 @@ +/* sfi.h Simple Firmware Interface */ + +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2009 Intel Corporation. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + The full GNU General Public License is included in this distribution + in the file called LICENSE.GPL. + + BSD LICENSE + + Copyright(c) 2009 Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef _LINUX_SFI_H +#define _LINUX_SFI_H + +/* Table signatures reserved by the SFI specification */ +#define SFI_SIG_SYST "SYST" +#define SFI_SIG_FREQ "FREQ" +#define SFI_SIG_IDLE "IDLE" +#define SFI_SIG_CPUS "CPUS" +#define SFI_SIG_MTMR "MTMR" +#define SFI_SIG_MRTC "MRTC" +#define SFI_SIG_MMAP "MMAP" +#define SFI_SIG_APIC "APIC" +#define SFI_SIG_XSDT "XSDT" +#define SFI_SIG_WAKE "WAKE" +#define SFI_SIG_SPIB "SPIB" +#define SFI_SIG_I2CB "I2CB" +#define SFI_SIG_GPEM "GPEM" + +#define SFI_SIGNATURE_SIZE 4 +#define SFI_OEM_ID_SIZE 6 +#define SFI_OEM_TABLE_ID_SIZE 8 + +#define SFI_SYST_SEARCH_BEGIN 0x000E0000 +#define SFI_SYST_SEARCH_END 0x000FFFFF + +#define SFI_GET_NUM_ENTRIES(ptable, entry_type) \ + ((ptable->header.len - sizeof(struct sfi_table_header)) / \ + (sizeof(entry_type))) +/* + * Table structures must be byte-packed to match the SFI specification, + * as they are provided by the BIOS. + */ +struct sfi_table_header { + char sig[SFI_SIGNATURE_SIZE]; + u32 len; + u8 rev; + u8 csum; + char oem_id[SFI_OEM_ID_SIZE]; + char oem_table_id[SFI_OEM_TABLE_ID_SIZE]; +} __packed; + +struct sfi_table_simple { + struct sfi_table_header header; + u64 pentry[1]; +} __packed; + +/* Comply with UEFI spec 2.1 */ +struct sfi_mem_entry { + u32 type; + u64 phys_start; + u64 virt_start; + u64 pages; + u64 attrib; +} __packed; + +struct sfi_cpu_table_entry { + u32 apic_id; +} __packed; + +struct sfi_cstate_table_entry { + u32 hint; /* MWAIT hint */ + u32 latency; /* latency in ms */ +} __packed; + +struct sfi_apic_table_entry { + u64 phys_addr; /* phy base addr for APIC reg */ +} __packed; + +struct sfi_freq_table_entry { + u32 freq_mhz; /* in MHZ */ + u32 latency; /* transition latency in ms */ + u32 ctrl_val; /* value to write to PERF_CTL */ +} __packed; + +struct sfi_wake_table_entry { + u64 phys_addr; /* pointer to where the wake vector locates */ +} __packed; + +struct sfi_timer_table_entry { + u64 phys_addr; /* phy base addr for the timer */ + u32 freq_hz; /* in HZ */ + u32 irq; +} __packed; + +struct sfi_rtc_table_entry { + u64 phys_addr; /* phy base addr for the RTC */ + u32 irq; +} __packed; + +struct sfi_spi_table_entry { + u16 host_num; /* attached to host 0, 1...*/ + u16 cs; /* chip select */ + u16 irq_info; + char name[16]; + u8 dev_info[10]; +} __packed; + +struct sfi_i2c_table_entry { + u16 host_num; + u16 addr; /* slave addr */ + u16 irq_info; + char name[16]; + u8 dev_info[10]; +} __packed; + +struct sfi_gpe_table_entry { + u16 logical_id; /* logical id */ + u16 phys_id; /* physical GPE id */ +} __packed; + + +typedef int (*sfi_table_handler) (struct sfi_table_header *table); + +#ifdef CONFIG_SFI +extern void __init sfi_init(void); +extern int __init sfi_platform_init(void); +extern void __init sfi_init_late(void); +extern int sfi_table_parse(char *signature, char *oem_id, char *oem_table_id, + sfi_table_handler handler); + +extern int sfi_disabled; +static inline void disable_sfi(void) +{ + sfi_disabled = 1; +} + +#else /* !CONFIG_SFI */ + +static inline void sfi_init(void) +{ +} + +static inline void sfi_init_late(void) +{ +} + +#define sfi_disabled 0 + +static inline int sfi_table_parse(char *signature, char *oem_id, + char *oem_table_id, + sfi_table_handler handler) +{ + return -1; +} + +#endif /* !CONFIG_SFI */ + +#endif /*_LINUX_SFI_H*/ -- cgit v1.2.3-71-gd317 From 13e82d023c4c3f13ab1e665cbb917a7ebba8935c Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 14 Aug 2009 15:17:53 -0400 Subject: SFI: add capability to parse ACPI tables Extend SFI to access standard ACPI tables. (eg. the PCI MCFG) using sfi_acpi_table_parse(). Note that this is _not_ a hybrid ACPI + SFI mode. The platform boots in either ACPI mode or SFI mode. SFI runs only with acpi_disabled=1, which can be set at build-time via CONFIG_ACPI=n, or at boot time by the failure to find ACPI platform support. So this extension simply allows SFI-platforms to re-use existing standard table formats that happen to be defined to live in ACPI envelopes. Signed-off-by: Feng Tang Signed-off-by: Len Brown --- drivers/sfi/sfi_acpi.c | 175 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sfi_acpi.h | 93 +++++++++++++++++++++++++ 2 files changed, 268 insertions(+) create mode 100644 drivers/sfi/sfi_acpi.c create mode 100644 include/linux/sfi_acpi.h (limited to 'include/linux') diff --git a/drivers/sfi/sfi_acpi.c b/drivers/sfi/sfi_acpi.c new file mode 100644 index 000000000000..34aba30eb84b --- /dev/null +++ b/drivers/sfi/sfi_acpi.c @@ -0,0 +1,175 @@ +/* sfi_acpi.c Simple Firmware Interface - ACPI extensions */ + +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2009 Intel Corporation. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + The full GNU General Public License is included in this distribution + in the file called LICENSE.GPL. + + BSD LICENSE + + Copyright(c) 2009 Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#define KMSG_COMPONENT "SFI" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include +#include "sfi_core.h" + +/* + * SFI can access ACPI-defined tables via an optional ACPI XSDT. + * + * This allows re-use, and avoids re-definition, of standard tables. + * For example, the "MCFG" table is defined by PCI, reserved by ACPI, + * and is expected to be present many SFI-only systems. + */ + +static struct acpi_table_xsdt *xsdt_va __read_mostly; + +#define XSDT_GET_NUM_ENTRIES(ptable, entry_type) \ + ((ptable->header.length - sizeof(struct acpi_table_header)) / \ + (sizeof(entry_type))) + +static inline struct sfi_table_header *acpi_to_sfi_th( + struct acpi_table_header *th) +{ + return (struct sfi_table_header *)th; +} + +static inline struct acpi_table_header *sfi_to_acpi_th( + struct sfi_table_header *th) +{ + return (struct acpi_table_header *)th; +} + +/* + * sfi_acpi_parse_xsdt() + * + * Parse the ACPI XSDT for later access by sfi_acpi_table_parse(). + */ +static int __init sfi_acpi_parse_xsdt(struct sfi_table_header *th) +{ + struct sfi_table_key key = SFI_ANY_KEY; + int tbl_cnt, i; + void *ret; + + xsdt_va = (struct acpi_table_xsdt *)th; + tbl_cnt = XSDT_GET_NUM_ENTRIES(xsdt_va, u64); + for (i = 0; i < tbl_cnt; i++) { + ret = sfi_check_table(xsdt_va->table_offset_entry[i], &key); + if (IS_ERR(ret)) { + disable_sfi(); + return -1; + } + } + + return 0; +} + +int __init sfi_acpi_init(void) +{ + struct sfi_table_key xsdt_key = { .sig = SFI_SIG_XSDT }; + + sfi_table_parse(SFI_SIG_XSDT, NULL, NULL, sfi_acpi_parse_xsdt); + + /* Only call the get_table to keep the table mapped */ + xsdt_va = (struct acpi_table_xsdt *)sfi_get_table(&xsdt_key); + return 0; +} + +static struct acpi_table_header *sfi_acpi_get_table(struct sfi_table_key *key) +{ + u32 tbl_cnt, i; + void *ret; + + tbl_cnt = XSDT_GET_NUM_ENTRIES(xsdt_va, u64); + for (i = 0; i < tbl_cnt; i++) { + ret = sfi_check_table(xsdt_va->table_offset_entry[i], key); + if (!IS_ERR(ret) && ret) + return sfi_to_acpi_th(ret); + } + + return NULL; +} + +static void sfi_acpi_put_table(struct acpi_table_header *table) +{ + sfi_put_table(acpi_to_sfi_th(table)); +} + +/* + * sfi_acpi_table_parse() + * + * Find specified table in XSDT, run handler on it and return its return value + */ +int sfi_acpi_table_parse(char *signature, char *oem_id, char *oem_table_id, + int(*handler)(struct acpi_table_header *)) +{ + struct acpi_table_header *table = NULL; + struct sfi_table_key key; + int ret = 0; + + if (sfi_disabled) + return -1; + + key.sig = signature; + key.oem_id = oem_id; + key.oem_table_id = oem_table_id; + + table = sfi_acpi_get_table(&key); + if (!table) + return -EINVAL; + + ret = handler(table); + sfi_acpi_put_table(table); + return ret; +} diff --git a/include/linux/sfi_acpi.h b/include/linux/sfi_acpi.h new file mode 100644 index 000000000000..c4a5a8cd4469 --- /dev/null +++ b/include/linux/sfi_acpi.h @@ -0,0 +1,93 @@ +/* sfi.h Simple Firmware Interface */ + +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2009 Intel Corporation. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + The full GNU General Public License is included in this distribution + in the file called LICENSE.GPL. + + BSD LICENSE + + Copyright(c) 2009 Intel Corporation. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef _LINUX_SFI_ACPI_H +#define _LINUX_SFI_ACPI_H + +#ifdef CONFIG_SFI +#include /* struct acpi_table_header */ + +extern int sfi_acpi_table_parse(char *signature, char *oem_id, + char *oem_table_id, + int (*handler)(struct acpi_table_header *)); + +static inline int acpi_sfi_table_parse(char *signature, + int (*handler)(struct acpi_table_header *)) +{ + if (!acpi_table_parse(signature, handler)) + return 0; + + return sfi_acpi_table_parse(signature, NULL, NULL, handler); +} +#else /* !CONFIG_SFI */ + +static inline int sfi_acpi_table_parse(char *signature, char *oem_id, + char *oem_table_id, + int (*handler)(struct acpi_table_header *)) +{ + return -1; +} + +static inline int acpi_sfi_table_parse(char *signature, + int (*handler)(struct acpi_table_header *)) +{ + return acpi_table_parse(signature, handler); +} +#endif /* !CONFIG_SFI */ + +#endif /*_LINUX_SFI_ACPI_H*/ -- cgit v1.2.3-71-gd317 From ad283ea4a3ce82cda2efe33163748a397b31b1eb Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 29 Aug 2009 19:09:26 -0700 Subject: async_tx: add sum check flags Replace the flat zero_sum_result with a collection of flags to contain the P (xor) zero-sum result, and the soon to be utilized Q (raid6 reed solomon syndrome) zero-sum result. Use the SUM_CHECK_ namespace instead of DMA_ since these flags will be used on non-dma-zero-sum enabled platforms. Reviewed-by: Andre Noll Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams --- arch/arm/include/asm/hardware/iop3xx-adma.h | 5 +++-- arch/arm/mach-iop13xx/include/mach/adma.h | 12 +++++++----- crypto/async_tx/async_xor.c | 4 ++-- drivers/md/raid5.c | 2 +- drivers/md/raid5.h | 5 +++-- include/linux/async_tx.h | 2 +- include/linux/dmaengine.h | 21 ++++++++++++++++++++- 7 files changed, 37 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/hardware/iop3xx-adma.h b/arch/arm/include/asm/hardware/iop3xx-adma.h index 83e6ba338e2c..26eefea02314 100644 --- a/arch/arm/include/asm/hardware/iop3xx-adma.h +++ b/arch/arm/include/asm/hardware/iop3xx-adma.h @@ -756,13 +756,14 @@ static inline void iop_desc_set_block_fill_val(struct iop_adma_desc_slot *desc, hw_desc->src[0] = val; } -static inline int iop_desc_get_zero_result(struct iop_adma_desc_slot *desc) +static inline enum sum_check_flags +iop_desc_get_zero_result(struct iop_adma_desc_slot *desc) { struct iop3xx_desc_aau *hw_desc = desc->hw_desc; struct iop3xx_aau_desc_ctrl desc_ctrl = hw_desc->desc_ctrl_field; iop_paranoia(!(desc_ctrl.tx_complete && desc_ctrl.zero_result_en)); - return desc_ctrl.zero_result_err; + return desc_ctrl.zero_result_err << SUM_CHECK_P; } static inline void iop_chan_append(struct iop_adma_chan *chan) diff --git a/arch/arm/mach-iop13xx/include/mach/adma.h b/arch/arm/mach-iop13xx/include/mach/adma.h index 5722e86f2174..1cd31df8924d 100644 --- a/arch/arm/mach-iop13xx/include/mach/adma.h +++ b/arch/arm/mach-iop13xx/include/mach/adma.h @@ -428,18 +428,20 @@ static inline void iop_desc_set_block_fill_val(struct iop_adma_desc_slot *desc, hw_desc->block_fill_data = val; } -static inline int iop_desc_get_zero_result(struct iop_adma_desc_slot *desc) +static inline enum sum_check_flags +iop_desc_get_zero_result(struct iop_adma_desc_slot *desc) { struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; struct iop13xx_adma_desc_ctrl desc_ctrl = hw_desc->desc_ctrl_field; struct iop13xx_adma_byte_count byte_count = hw_desc->byte_count_field; + enum sum_check_flags flags; BUG_ON(!(byte_count.tx_complete && desc_ctrl.zero_result)); - if (desc_ctrl.pq_xfer_en) - return byte_count.zero_result_err_q; - else - return byte_count.zero_result_err; + flags = byte_count.zero_result_err_q << SUM_CHECK_Q; + flags |= byte_count.zero_result_err << SUM_CHECK_P; + + return flags; } static inline void iop_chan_append(struct iop_adma_chan *chan) diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c index 1e96c4df7061..78fb7780272a 100644 --- a/crypto/async_tx/async_xor.c +++ b/crypto/async_tx/async_xor.c @@ -246,7 +246,7 @@ static int page_is_zero(struct page *p, unsigned int offset, size_t len) */ struct dma_async_tx_descriptor * async_xor_val(struct page *dest, struct page **src_list, unsigned int offset, - int src_cnt, size_t len, u32 *result, + int src_cnt, size_t len, enum sum_check_flags *result, struct async_submit_ctl *submit) { struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR_VAL, @@ -304,7 +304,7 @@ async_xor_val(struct page *dest, struct page **src_list, unsigned int offset, async_tx_quiesce(&tx); - *result = page_is_zero(dest, offset, len) ? 0 : 1; + *result = !page_is_zero(dest, offset, len) << SUM_CHECK_P; async_tx_sync_epilog(submit); submit->flags = flags_orig; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7727954cf726..1f2a266f3cf7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2590,7 +2590,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, * we are done. Otherwise update the mismatch count and repair * parity if !MD_RECOVERY_CHECK */ - if (sh->ops.zero_sum_result == 0) + if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) /* parity is correct (on disc, * not in buffer any more) */ diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index e7baabffee86..75f2c6c4cf90 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -2,6 +2,7 @@ #define _RAID5_H #include +#include /* * @@ -215,8 +216,8 @@ struct stripe_head { * @target - STRIPE_OP_COMPUTE_BLK target */ struct stripe_operations { - int target; - u32 zero_sum_result; + int target; + enum sum_check_flags zero_sum_result; } ops; struct r5dev { struct bio req; diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 00cfb637ddf2..3d21a2517518 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -148,7 +148,7 @@ async_xor(struct page *dest, struct page **src_list, unsigned int offset, struct dma_async_tx_descriptor * async_xor_val(struct page *dest, struct page **src_list, unsigned int offset, - int src_cnt, size_t len, u32 *result, + int src_cnt, size_t len, enum sum_check_flags *result, struct async_submit_ctl *submit); struct dma_async_tx_descriptor * diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 6768727d00d7..02447afcebad 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -86,6 +86,25 @@ enum dma_ctrl_flags { DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3), }; +/** + * enum sum_check_bits - bit position of pq_check_flags + */ +enum sum_check_bits { + SUM_CHECK_P = 0, + SUM_CHECK_Q = 1, +}; + +/** + * enum pq_check_flags - result of async_{xor,pq}_zero_sum operations + * @SUM_CHECK_P_RESULT - 1 if xor zero sum error, 0 otherwise + * @SUM_CHECK_Q_RESULT - 1 if reed-solomon zero sum error, 0 otherwise + */ +enum sum_check_flags { + SUM_CHECK_P_RESULT = (1 << SUM_CHECK_P), + SUM_CHECK_Q_RESULT = (1 << SUM_CHECK_Q), +}; + + /** * dma_cap_mask_t - capabilities bitmap modeled after cpumask_t. * See linux/cpumask.h @@ -245,7 +264,7 @@ struct dma_device { unsigned int src_cnt, size_t len, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_xor_val)( struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt, - size_t len, u32 *result, unsigned long flags); + size_t len, enum sum_check_flags *result, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_memset)( struct dma_chan *chan, dma_addr_t dest, int value, size_t len, unsigned long flags); -- cgit v1.2.3-71-gd317 From 95475e57113c66aac7583925736ed2e2d58c990d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 14 Jul 2009 12:19:02 -0700 Subject: async_tx: remove walk of tx->parent chain in dma_wait_for_async_tx We currently walk the parent chain when waiting for a given tx to complete however this walk may race with the driver cleanup routine. The routines in async_raid6_recov.c may fall back to the synchronous path at any point so we need to be prepared to call async_tx_quiesce() (which calls dma_wait_for_async_tx). To remove the ->parent walk we guarantee that every time a dependency is attached ->issue_pending() is invoked, then we can simply poll the initial descriptor until completion. This also allows for a lighter weight 'issue pending' implementation as there is no longer a requirement to iterate through all the channels' ->issue_pending() routines as long as operations have been submitted in an ordered chain. async_tx_issue_pending() is added for this case. Signed-off-by: Dan Williams --- crypto/async_tx/async_tx.c | 13 +++++++------ drivers/dma/dmaengine.c | 45 ++++++++++----------------------------------- include/linux/async_tx.h | 23 +++++++++++++++++++++++ 3 files changed, 40 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index 6e37ad3f4417..60615fedcf5e 100644 --- a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -77,8 +77,8 @@ static void async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx, struct dma_async_tx_descriptor *tx) { - struct dma_chan *chan; - struct dma_device *device; + struct dma_chan *chan = depend_tx->chan; + struct dma_device *device = chan->device; struct dma_async_tx_descriptor *intr_tx = (void *) ~0; /* first check to see if we can still append to depend_tx */ @@ -90,11 +90,11 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx, } spin_unlock_bh(&depend_tx->lock); - if (!intr_tx) + /* attached dependency, flush the parent channel */ + if (!intr_tx) { + device->device_issue_pending(chan); return; - - chan = depend_tx->chan; - device = chan->device; + } /* see if we can schedule an interrupt * otherwise poll for completion @@ -128,6 +128,7 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx, intr_tx->tx_submit(intr_tx); async_tx_ack(intr_tx); } + device->device_issue_pending(chan); } else { if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR) panic("%s: DMA_ERROR waiting for depend_tx\n", diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 6781e8f3c064..e002e0e0d055 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -934,49 +934,24 @@ EXPORT_SYMBOL(dma_async_tx_descriptor_init); /* dma_wait_for_async_tx - spin wait for a transaction to complete * @tx: in-flight transaction to wait on - * - * This routine assumes that tx was obtained from a call to async_memcpy, - * async_xor, async_memset, etc which ensures that tx is "in-flight" (prepped - * and submitted). Walking the parent chain is only meant to cover for DMA - * drivers that do not implement the DMA_INTERRUPT capability and may race with - * the driver's descriptor cleanup routine. */ enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) { - enum dma_status status; - struct dma_async_tx_descriptor *iter; - struct dma_async_tx_descriptor *parent; + unsigned long dma_sync_wait_timeout = jiffies + msecs_to_jiffies(5000); if (!tx) return DMA_SUCCESS; - WARN_ONCE(tx->parent, "%s: speculatively walking dependency chain for" - " %s\n", __func__, dma_chan_name(tx->chan)); - - /* poll through the dependency chain, return when tx is complete */ - do { - iter = tx; - - /* find the root of the unsubmitted dependency chain */ - do { - parent = iter->parent; - if (!parent) - break; - else - iter = parent; - } while (parent); - - /* there is a small window for ->parent == NULL and - * ->cookie == -EBUSY - */ - while (iter->cookie == -EBUSY) - cpu_relax(); - - status = dma_sync_wait(iter->chan, iter->cookie); - } while (status == DMA_IN_PROGRESS || (iter != tx)); - - return status; + while (tx->cookie == -EBUSY) { + if (time_after_eq(jiffies, dma_sync_wait_timeout)) { + pr_err("%s timeout waiting for descriptor submission\n", + __func__); + return DMA_ERROR; + } + cpu_relax(); + } + return dma_sync_wait(tx->chan, tx->cookie); } EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 3d21a2517518..12a2efcbd565 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -83,6 +83,24 @@ struct async_submit_ctl { #ifdef CONFIG_DMA_ENGINE #define async_tx_issue_pending_all dma_issue_pending_all + +/** + * async_tx_issue_pending - send pending descriptor to the hardware channel + * @tx: descriptor handle to retrieve hardware context + * + * Note: any dependent operations will have already been issued by + * async_tx_channel_switch, or (in the case of no channel switch) will + * be already pending on this channel. + */ +static inline void async_tx_issue_pending(struct dma_async_tx_descriptor *tx) +{ + if (likely(tx)) { + struct dma_chan *chan = tx->chan; + struct dma_device *dma = chan->device; + + dma->device_issue_pending(chan); + } +} #ifdef CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL #include #else @@ -98,6 +116,11 @@ static inline void async_tx_issue_pending_all(void) do { } while (0); } +static inline void async_tx_issue_pending(struct dma_async_tx_descriptor *tx) +{ + do { } while (0); +} + static inline struct dma_chan * async_tx_find_channel(struct async_submit_ctl *submit, enum dma_transaction_type tx_type, struct page **dst, -- cgit v1.2.3-71-gd317 From b2f46fd8ef3dff2ab30f31126833f78b7480283a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 14 Jul 2009 12:20:36 -0700 Subject: async_tx: add support for asynchronous GF multiplication [ Based on an original patch by Yuri Tikhonov ] This adds support for doing asynchronous GF multiplication by adding two additional functions to the async_tx API: async_gen_syndrome() does simultaneous XOR and Galois field multiplication of sources. async_syndrome_val() validates the given source buffers against known P and Q values. When a request is made to run async_pq against more than the hardware maximum number of supported sources we need to reuse the previous generated P and Q values as sources into the next operation. Care must be taken to remove Q from P' and P from Q'. For example to perform a 5 source pq op with hardware that only supports 4 sources at a time the following approach is taken: p, q = PQ(src0, src1, src2, src3, COEF({01}, {02}, {04}, {08})) p', q' = PQ(p, q, q, src4, COEF({00}, {01}, {00}, {10})) p' = p + q + q + src4 = p + src4 q' = {00}*p + {01}*q + {00}*q + {10}*src4 = q + {10}*src4 Note: 4 is the minimum acceptable maxpq otherwise we punt to synchronous-software path. The DMA_PREP_CONTINUE flag indicates to the driver to reuse p and q as sources (in the above manner) and fill the remaining slots up to maxpq with the new sources/coefficients. Note1: Some devices have native support for P+Q continuation and can skip this extra work. Devices with this capability can advertise it with dma_set_maxpq. It is up to each driver how to handle the DMA_PREP_CONTINUE flag. Note2: The api supports disabling the generation of P when generating Q, this is ignored by the synchronous path but is implemented by some dma devices to save unnecessary writes. In this case the continuation algorithm is simplified to only reuse Q as a source. Cc: H. Peter Anvin Cc: David Woodhouse Signed-off-by: Yuri Tikhonov Signed-off-by: Ilya Yanok Reviewed-by: Andre Noll Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams --- Documentation/crypto/async-tx-api.txt | 3 + arch/arm/mach-iop13xx/setup.c | 2 +- crypto/async_tx/Kconfig | 4 + crypto/async_tx/Makefile | 1 + crypto/async_tx/async_pq.c | 388 ++++++++++++++++++++++++++++++++++ crypto/async_tx/async_xor.c | 2 +- drivers/dma/dmaengine.c | 4 + drivers/dma/iop-adma.c | 2 +- include/linux/async_tx.h | 9 + include/linux/dmaengine.h | 87 +++++++- 10 files changed, 493 insertions(+), 9 deletions(-) create mode 100644 crypto/async_tx/async_pq.c (limited to 'include/linux') diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt index 6b15e488c0e7..0e48e054d69a 100644 --- a/Documentation/crypto/async-tx-api.txt +++ b/Documentation/crypto/async-tx-api.txt @@ -64,6 +64,9 @@ xor - xor a series of source buffers and write the result to a xor_val - xor a series of source buffers and set a flag if the result is zero. The implementation attempts to prevent writes to memory +pq - generate the p+q (raid6 syndrome) from a series of source buffers +pq_val - validate that a p and or q buffer are in sync with a given series of + sources 3.3 Descriptor management: The return value is non-NULL and points to a 'descriptor' when the operation diff --git a/arch/arm/mach-iop13xx/setup.c b/arch/arm/mach-iop13xx/setup.c index 9800228b71d3..2e7ca0d75f8a 100644 --- a/arch/arm/mach-iop13xx/setup.c +++ b/arch/arm/mach-iop13xx/setup.c @@ -506,7 +506,7 @@ void __init iop13xx_platform_init(void) dma_cap_set(DMA_MEMSET, plat_data->cap_mask); dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask); dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask); - dma_cap_set(DMA_PQ_XOR, plat_data->cap_mask); + dma_cap_set(DMA_PQ, plat_data->cap_mask); dma_cap_set(DMA_PQ_UPDATE, plat_data->cap_mask); dma_cap_set(DMA_PQ_VAL, plat_data->cap_mask); break; diff --git a/crypto/async_tx/Kconfig b/crypto/async_tx/Kconfig index d8fb39145986..cb6d7314f198 100644 --- a/crypto/async_tx/Kconfig +++ b/crypto/async_tx/Kconfig @@ -14,3 +14,7 @@ config ASYNC_MEMSET tristate select ASYNC_CORE +config ASYNC_PQ + tristate + select ASYNC_CORE + diff --git a/crypto/async_tx/Makefile b/crypto/async_tx/Makefile index 27baa7d52fbc..1b9926588259 100644 --- a/crypto/async_tx/Makefile +++ b/crypto/async_tx/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_ASYNC_CORE) += async_tx.o obj-$(CONFIG_ASYNC_MEMCPY) += async_memcpy.o obj-$(CONFIG_ASYNC_MEMSET) += async_memset.o obj-$(CONFIG_ASYNC_XOR) += async_xor.o +obj-$(CONFIG_ASYNC_PQ) += async_pq.o diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c new file mode 100644 index 000000000000..108b21efb499 --- /dev/null +++ b/crypto/async_tx/async_pq.c @@ -0,0 +1,388 @@ +/* + * Copyright(c) 2007 Yuri Tikhonov + * Copyright(c) 2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ +#include +#include +#include +#include +#include + +/** + * scribble - space to hold throwaway P buffer for synchronous gen_syndrome + */ +static struct page *scribble; + +static bool is_raid6_zero_block(struct page *p) +{ + return p == (void *) raid6_empty_zero_page; +} + +/* the struct page *blocks[] parameter passed to async_gen_syndrome() + * and async_syndrome_val() contains the 'P' destination address at + * blocks[disks-2] and the 'Q' destination address at blocks[disks-1] + * + * note: these are macros as they are used as lvalues + */ +#define P(b, d) (b[d-2]) +#define Q(b, d) (b[d-1]) + +/** + * do_async_gen_syndrome - asynchronously calculate P and/or Q + */ +static __async_inline struct dma_async_tx_descriptor * +do_async_gen_syndrome(struct dma_chan *chan, struct page **blocks, + const unsigned char *scfs, unsigned int offset, int disks, + size_t len, dma_addr_t *dma_src, + struct async_submit_ctl *submit) +{ + struct dma_async_tx_descriptor *tx = NULL; + struct dma_device *dma = chan->device; + enum dma_ctrl_flags dma_flags = 0; + enum async_tx_flags flags_orig = submit->flags; + dma_async_tx_callback cb_fn_orig = submit->cb_fn; + dma_async_tx_callback cb_param_orig = submit->cb_param; + int src_cnt = disks - 2; + unsigned char coefs[src_cnt]; + unsigned short pq_src_cnt; + dma_addr_t dma_dest[2]; + int src_off = 0; + int idx; + int i; + + /* DMAs use destinations as sources, so use BIDIRECTIONAL mapping */ + if (P(blocks, disks)) + dma_dest[0] = dma_map_page(dma->dev, P(blocks, disks), offset, + len, DMA_BIDIRECTIONAL); + else + dma_flags |= DMA_PREP_PQ_DISABLE_P; + if (Q(blocks, disks)) + dma_dest[1] = dma_map_page(dma->dev, Q(blocks, disks), offset, + len, DMA_BIDIRECTIONAL); + else + dma_flags |= DMA_PREP_PQ_DISABLE_Q; + + /* convert source addresses being careful to collapse 'empty' + * sources and update the coefficients accordingly + */ + for (i = 0, idx = 0; i < src_cnt; i++) { + if (is_raid6_zero_block(blocks[i])) + continue; + dma_src[idx] = dma_map_page(dma->dev, blocks[i], offset, len, + DMA_TO_DEVICE); + coefs[idx] = scfs[i]; + idx++; + } + src_cnt = idx; + + while (src_cnt > 0) { + submit->flags = flags_orig; + pq_src_cnt = min(src_cnt, dma_maxpq(dma, dma_flags)); + /* if we are submitting additional pqs, leave the chain open, + * clear the callback parameters, and leave the destination + * buffers mapped + */ + if (src_cnt > pq_src_cnt) { + submit->flags &= ~ASYNC_TX_ACK; + dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP; + submit->cb_fn = NULL; + submit->cb_param = NULL; + } else { + dma_flags &= ~DMA_COMPL_SKIP_DEST_UNMAP; + submit->cb_fn = cb_fn_orig; + submit->cb_param = cb_param_orig; + if (cb_fn_orig) + dma_flags |= DMA_PREP_INTERRUPT; + } + + /* Since we have clobbered the src_list we are committed + * to doing this asynchronously. Drivers force forward + * progress in case they can not provide a descriptor + */ + for (;;) { + tx = dma->device_prep_dma_pq(chan, dma_dest, + &dma_src[src_off], + pq_src_cnt, + &coefs[src_off], len, + dma_flags); + if (likely(tx)) + break; + async_tx_quiesce(&submit->depend_tx); + dma_async_issue_pending(chan); + } + + async_tx_submit(chan, tx, submit); + submit->depend_tx = tx; + + /* drop completed sources */ + src_cnt -= pq_src_cnt; + src_off += pq_src_cnt; + + dma_flags |= DMA_PREP_CONTINUE; + } + + return tx; +} + +/** + * do_sync_gen_syndrome - synchronously calculate a raid6 syndrome + */ +static void +do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, + size_t len, struct async_submit_ctl *submit) +{ + void **srcs; + int i; + + if (submit->scribble) + srcs = submit->scribble; + else + srcs = (void **) blocks; + + for (i = 0; i < disks; i++) { + if (is_raid6_zero_block(blocks[i])) { + BUG_ON(i > disks - 3); /* P or Q can't be zero */ + srcs[i] = blocks[i]; + } else + srcs[i] = page_address(blocks[i]) + offset; + } + raid6_call.gen_syndrome(disks, len, srcs); + async_tx_sync_epilog(submit); +} + +/** + * async_gen_syndrome - asynchronously calculate a raid6 syndrome + * @blocks: source blocks from idx 0..disks-3, P @ disks-2 and Q @ disks-1 + * @offset: common offset into each block (src and dest) to start transaction + * @disks: number of blocks (including missing P or Q, see below) + * @len: length of operation in bytes + * @submit: submission/completion modifiers + * + * General note: This routine assumes a field of GF(2^8) with a + * primitive polynomial of 0x11d and a generator of {02}. + * + * 'disks' note: callers can optionally omit either P or Q (but not + * both) from the calculation by setting blocks[disks-2] or + * blocks[disks-1] to NULL. When P or Q is omitted 'len' must be <= + * PAGE_SIZE as a temporary buffer of this size is used in the + * synchronous path. 'disks' always accounts for both destination + * buffers. + * + * 'blocks' note: if submit->scribble is NULL then the contents of + * 'blocks' may be overridden + */ +struct dma_async_tx_descriptor * +async_gen_syndrome(struct page **blocks, unsigned int offset, int disks, + size_t len, struct async_submit_ctl *submit) +{ + int src_cnt = disks - 2; + struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ, + &P(blocks, disks), 2, + blocks, src_cnt, len); + struct dma_device *device = chan ? chan->device : NULL; + dma_addr_t *dma_src = NULL; + + BUG_ON(disks > 255 || !(P(blocks, disks) || Q(blocks, disks))); + + if (submit->scribble) + dma_src = submit->scribble; + else if (sizeof(dma_addr_t) <= sizeof(struct page *)) + dma_src = (dma_addr_t *) blocks; + + if (dma_src && device && + (src_cnt <= dma_maxpq(device, 0) || + dma_maxpq(device, DMA_PREP_CONTINUE) > 0)) { + /* run the p+q asynchronously */ + pr_debug("%s: (async) disks: %d len: %zu\n", + __func__, disks, len); + return do_async_gen_syndrome(chan, blocks, raid6_gfexp, offset, + disks, len, dma_src, submit); + } + + /* run the pq synchronously */ + pr_debug("%s: (sync) disks: %d len: %zu\n", __func__, disks, len); + + /* wait for any prerequisite operations */ + async_tx_quiesce(&submit->depend_tx); + + if (!P(blocks, disks)) { + P(blocks, disks) = scribble; + BUG_ON(len + offset > PAGE_SIZE); + } + if (!Q(blocks, disks)) { + Q(blocks, disks) = scribble; + BUG_ON(len + offset > PAGE_SIZE); + } + do_sync_gen_syndrome(blocks, offset, disks, len, submit); + + return NULL; +} +EXPORT_SYMBOL_GPL(async_gen_syndrome); + +/** + * async_syndrome_val - asynchronously validate a raid6 syndrome + * @blocks: source blocks from idx 0..disks-3, P @ disks-2 and Q @ disks-1 + * @offset: common offset into each block (src and dest) to start transaction + * @disks: number of blocks (including missing P or Q, see below) + * @len: length of operation in bytes + * @pqres: on val failure SUM_CHECK_P_RESULT and/or SUM_CHECK_Q_RESULT are set + * @spare: temporary result buffer for the synchronous case + * @submit: submission / completion modifiers + * + * The same notes from async_gen_syndrome apply to the 'blocks', + * and 'disks' parameters of this routine. The synchronous path + * requires a temporary result buffer and submit->scribble to be + * specified. + */ +struct dma_async_tx_descriptor * +async_syndrome_val(struct page **blocks, unsigned int offset, int disks, + size_t len, enum sum_check_flags *pqres, struct page *spare, + struct async_submit_ctl *submit) +{ + struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ_VAL, + NULL, 0, blocks, disks, + len); + struct dma_device *device = chan ? chan->device : NULL; + struct dma_async_tx_descriptor *tx; + enum dma_ctrl_flags dma_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0; + dma_addr_t *dma_src = NULL; + + BUG_ON(disks < 4); + + if (submit->scribble) + dma_src = submit->scribble; + else if (sizeof(dma_addr_t) <= sizeof(struct page *)) + dma_src = (dma_addr_t *) blocks; + + if (dma_src && device && disks <= dma_maxpq(device, 0)) { + struct device *dev = device->dev; + dma_addr_t *pq = &dma_src[disks-2]; + int i; + + pr_debug("%s: (async) disks: %d len: %zu\n", + __func__, disks, len); + if (!P(blocks, disks)) + dma_flags |= DMA_PREP_PQ_DISABLE_P; + if (!Q(blocks, disks)) + dma_flags |= DMA_PREP_PQ_DISABLE_Q; + for (i = 0; i < disks; i++) + if (likely(blocks[i])) { + BUG_ON(is_raid6_zero_block(blocks[i])); + dma_src[i] = dma_map_page(dev, blocks[i], + offset, len, + DMA_TO_DEVICE); + } + + for (;;) { + tx = device->device_prep_dma_pq_val(chan, pq, dma_src, + disks - 2, + raid6_gfexp, + len, pqres, + dma_flags); + if (likely(tx)) + break; + async_tx_quiesce(&submit->depend_tx); + dma_async_issue_pending(chan); + } + async_tx_submit(chan, tx, submit); + + return tx; + } else { + struct page *p_src = P(blocks, disks); + struct page *q_src = Q(blocks, disks); + enum async_tx_flags flags_orig = submit->flags; + dma_async_tx_callback cb_fn_orig = submit->cb_fn; + void *scribble = submit->scribble; + void *cb_param_orig = submit->cb_param; + void *p, *q, *s; + + pr_debug("%s: (sync) disks: %d len: %zu\n", + __func__, disks, len); + + /* caller must provide a temporary result buffer and + * allow the input parameters to be preserved + */ + BUG_ON(!spare || !scribble); + + /* wait for any prerequisite operations */ + async_tx_quiesce(&submit->depend_tx); + + /* recompute p and/or q into the temporary buffer and then + * check to see the result matches the current value + */ + tx = NULL; + *pqres = 0; + if (p_src) { + init_async_submit(submit, ASYNC_TX_XOR_ZERO_DST, NULL, + NULL, NULL, scribble); + tx = async_xor(spare, blocks, offset, disks-2, len, submit); + async_tx_quiesce(&tx); + p = page_address(p_src) + offset; + s = page_address(spare) + offset; + *pqres |= !!memcmp(p, s, len) << SUM_CHECK_P; + } + + if (q_src) { + P(blocks, disks) = NULL; + Q(blocks, disks) = spare; + init_async_submit(submit, 0, NULL, NULL, NULL, scribble); + tx = async_gen_syndrome(blocks, offset, disks, len, submit); + async_tx_quiesce(&tx); + q = page_address(q_src) + offset; + s = page_address(spare) + offset; + *pqres |= !!memcmp(q, s, len) << SUM_CHECK_Q; + } + + /* restore P, Q and submit */ + P(blocks, disks) = p_src; + Q(blocks, disks) = q_src; + + submit->cb_fn = cb_fn_orig; + submit->cb_param = cb_param_orig; + submit->flags = flags_orig; + async_tx_sync_epilog(submit); + + return NULL; + } +} +EXPORT_SYMBOL_GPL(async_syndrome_val); + +static int __init async_pq_init(void) +{ + scribble = alloc_page(GFP_KERNEL); + + if (scribble) + return 0; + + pr_err("%s: failed to allocate required spare page\n", __func__); + + return -ENOMEM; +} + +static void __exit async_pq_exit(void) +{ + put_page(scribble); +} + +module_init(async_pq_init); +module_exit(async_pq_exit); + +MODULE_DESCRIPTION("asynchronous raid6 syndrome generation/validation"); +MODULE_LICENSE("GPL"); diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c index 78fb7780272a..56b5f98da463 100644 --- a/crypto/async_tx/async_xor.c +++ b/crypto/async_tx/async_xor.c @@ -62,7 +62,7 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, while (src_cnt) { submit->flags = flags_orig; dma_flags = 0; - xor_src_cnt = min(src_cnt, dma->max_xor); + xor_src_cnt = min(src_cnt, (int)dma->max_xor); /* if we are submitting additional xors, leave the chain open, * clear the callback parameters, and leave the destination * buffer mapped diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index e002e0e0d055..cd5673d3043b 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -646,6 +646,10 @@ int dma_async_device_register(struct dma_device *device) !device->device_prep_dma_xor); BUG_ON(dma_has_cap(DMA_XOR_VAL, device->cap_mask) && !device->device_prep_dma_xor_val); + BUG_ON(dma_has_cap(DMA_PQ, device->cap_mask) && + !device->device_prep_dma_pq); + BUG_ON(dma_has_cap(DMA_PQ_VAL, device->cap_mask) && + !device->device_prep_dma_pq_val); BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) && !device->device_prep_dma_memset); BUG_ON(dma_has_cap(DMA_INTERRUPT, device->cap_mask) && diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index 6ff79a672699..4496bc606662 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -1257,7 +1257,7 @@ static int __devinit iop_adma_probe(struct platform_device *pdev) dev_printk(KERN_INFO, &pdev->dev, "Intel(R) IOP: " "( %s%s%s%s%s%s%s%s%s%s)\n", - dma_has_cap(DMA_PQ_XOR, dma_dev->cap_mask) ? "pq_xor " : "", + dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? "pq " : "", dma_has_cap(DMA_PQ_UPDATE, dma_dev->cap_mask) ? "pq_update " : "", dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask) ? "pq_val " : "", dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "", diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 12a2efcbd565..e6ce5f004f98 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -185,5 +185,14 @@ async_memset(struct page *dest, int val, unsigned int offset, struct dma_async_tx_descriptor *async_trigger_callback(struct async_submit_ctl *submit); +struct dma_async_tx_descriptor * +async_gen_syndrome(struct page **blocks, unsigned int offset, int src_cnt, + size_t len, struct async_submit_ctl *submit); + +struct dma_async_tx_descriptor * +async_syndrome_val(struct page **blocks, unsigned int offset, int src_cnt, + size_t len, enum sum_check_flags *pqres, struct page *spare, + struct async_submit_ctl *submit); + void async_tx_quiesce(struct dma_async_tx_descriptor **tx); #endif /* _ASYNC_TX_H_ */ diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 02447afcebad..ce010cd991d2 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -52,7 +52,7 @@ enum dma_status { enum dma_transaction_type { DMA_MEMCPY, DMA_XOR, - DMA_PQ_XOR, + DMA_PQ, DMA_DUAL_XOR, DMA_PQ_UPDATE, DMA_XOR_VAL, @@ -70,20 +70,28 @@ enum dma_transaction_type { /** * enum dma_ctrl_flags - DMA flags to augment operation preparation, - * control completion, and communicate status. + * control completion, and communicate status. * @DMA_PREP_INTERRUPT - trigger an interrupt (callback) upon completion of - * this transaction + * this transaction * @DMA_CTRL_ACK - the descriptor cannot be reused until the client - * acknowledges receipt, i.e. has has a chance to establish any - * dependency chains + * acknowledges receipt, i.e. has has a chance to establish any dependency + * chains * @DMA_COMPL_SKIP_SRC_UNMAP - set to disable dma-unmapping the source buffer(s) * @DMA_COMPL_SKIP_DEST_UNMAP - set to disable dma-unmapping the destination(s) + * @DMA_PREP_PQ_DISABLE_P - prevent generation of P while generating Q + * @DMA_PREP_PQ_DISABLE_Q - prevent generation of Q while generating P + * @DMA_PREP_CONTINUE - indicate to a driver that it is reusing buffers as + * sources that were the result of a previous operation, in the case of a PQ + * operation it continues the calculation with new sources */ enum dma_ctrl_flags { DMA_PREP_INTERRUPT = (1 << 0), DMA_CTRL_ACK = (1 << 1), DMA_COMPL_SKIP_SRC_UNMAP = (1 << 2), DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3), + DMA_PREP_PQ_DISABLE_P = (1 << 4), + DMA_PREP_PQ_DISABLE_Q = (1 << 5), + DMA_PREP_CONTINUE = (1 << 6), }; /** @@ -226,6 +234,7 @@ struct dma_async_tx_descriptor { * @global_node: list_head for global dma_device_list * @cap_mask: one or more dma_capability flags * @max_xor: maximum number of xor sources, 0 if no capability + * @max_pq: maximum number of PQ sources and PQ-continue capability * @dev_id: unique device ID * @dev: struct device reference for dma mapping api * @device_alloc_chan_resources: allocate resources and return the @@ -234,6 +243,8 @@ struct dma_async_tx_descriptor { * @device_prep_dma_memcpy: prepares a memcpy operation * @device_prep_dma_xor: prepares a xor operation * @device_prep_dma_xor_val: prepares a xor validation operation + * @device_prep_dma_pq: prepares a pq operation + * @device_prep_dma_pq_val: prepares a pqzero_sum operation * @device_prep_dma_memset: prepares a memset operation * @device_prep_dma_interrupt: prepares an end of chain interrupt operation * @device_prep_slave_sg: prepares a slave dma operation @@ -248,7 +259,9 @@ struct dma_device { struct list_head channels; struct list_head global_node; dma_cap_mask_t cap_mask; - int max_xor; + unsigned short max_xor; + unsigned short max_pq; + #define DMA_HAS_PQ_CONTINUE (1 << 15) int dev_id; struct device *dev; @@ -265,6 +278,14 @@ struct dma_device { struct dma_async_tx_descriptor *(*device_prep_dma_xor_val)( struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt, size_t len, enum sum_check_flags *result, unsigned long flags); + struct dma_async_tx_descriptor *(*device_prep_dma_pq)( + struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src, + unsigned int src_cnt, const unsigned char *scf, + size_t len, unsigned long flags); + struct dma_async_tx_descriptor *(*device_prep_dma_pq_val)( + struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src, + unsigned int src_cnt, const unsigned char *scf, size_t len, + enum sum_check_flags *pqres, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_memset)( struct dma_chan *chan, dma_addr_t dest, int value, size_t len, unsigned long flags); @@ -283,6 +304,60 @@ struct dma_device { void (*device_issue_pending)(struct dma_chan *chan); }; +static inline void +dma_set_maxpq(struct dma_device *dma, int maxpq, int has_pq_continue) +{ + dma->max_pq = maxpq; + if (has_pq_continue) + dma->max_pq |= DMA_HAS_PQ_CONTINUE; +} + +static inline bool dmaf_continue(enum dma_ctrl_flags flags) +{ + return (flags & DMA_PREP_CONTINUE) == DMA_PREP_CONTINUE; +} + +static inline bool dmaf_p_disabled_continue(enum dma_ctrl_flags flags) +{ + enum dma_ctrl_flags mask = DMA_PREP_CONTINUE | DMA_PREP_PQ_DISABLE_P; + + return (flags & mask) == mask; +} + +static inline bool dma_dev_has_pq_continue(struct dma_device *dma) +{ + return (dma->max_pq & DMA_HAS_PQ_CONTINUE) == DMA_HAS_PQ_CONTINUE; +} + +static unsigned short dma_dev_to_maxpq(struct dma_device *dma) +{ + return dma->max_pq & ~DMA_HAS_PQ_CONTINUE; +} + +/* dma_maxpq - reduce maxpq in the face of continued operations + * @dma - dma device with PQ capability + * @flags - to check if DMA_PREP_CONTINUE and DMA_PREP_PQ_DISABLE_P are set + * + * When an engine does not support native continuation we need 3 extra + * source slots to reuse P and Q with the following coefficients: + * 1/ {00} * P : remove P from Q', but use it as a source for P' + * 2/ {01} * Q : use Q to continue Q' calculation + * 3/ {00} * Q : subtract Q from P' to cancel (2) + * + * In the case where P is disabled we only need 1 extra source: + * 1/ {01} * Q : use Q to continue Q' calculation + */ +static inline int dma_maxpq(struct dma_device *dma, enum dma_ctrl_flags flags) +{ + if (dma_dev_has_pq_continue(dma) || !dmaf_continue(flags)) + return dma_dev_to_maxpq(dma); + else if (dmaf_p_disabled_continue(flags)) + return dma_dev_to_maxpq(dma) - 1; + else if (dmaf_continue(flags)) + return dma_dev_to_maxpq(dma) - 3; + BUG(); +} + /* --- public DMA engine API --- */ #ifdef CONFIG_DMA_ENGINE -- cgit v1.2.3-71-gd317 From 0a82a6239beecc95db6e05fe43ee62d16b381d38 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 14 Jul 2009 12:20:37 -0700 Subject: async_tx: add support for asynchronous RAID6 recovery operations async_raid6_2data_recov() recovers two data disk failures async_raid6_datap_recov() recovers a data disk and the P disk These routines are a port of the synchronous versions found in drivers/md/raid6recov.c. The primary difference is breaking out the xor operations into separate calls to async_xor. Two helper routines are introduced to perform scalar multiplication where needed. async_sum_product() multiplies two sources by scalar coefficients and then sums (xor) the result. async_mult() simply multiplies a single source by a scalar. This implemention also includes, in contrast to the original synchronous-only code, special case handling for the 4-disk and 5-disk array cases. In these situations the default N-disk algorithm will present 0-source or 1-source operations to dma devices. To cover for dma devices where the minimum source count is 2 we implement 4-disk and 5-disk handling in the recovery code. [ Impact: asynchronous raid6 recovery routines for 2data and datap cases ] Cc: Yuri Tikhonov Cc: Ilya Yanok Cc: H. Peter Anvin Cc: David Woodhouse Reviewed-by: Andre Noll Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams --- Documentation/crypto/async-tx-api.txt | 4 + crypto/async_tx/Kconfig | 5 + crypto/async_tx/Makefile | 1 + crypto/async_tx/async_raid6_recov.c | 448 ++++++++++++++++++++++++++++++++++ include/linux/async_tx.h | 8 + 5 files changed, 466 insertions(+) create mode 100644 crypto/async_tx/async_raid6_recov.c (limited to 'include/linux') diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt index 0e48e054d69a..ba046b8fa92f 100644 --- a/Documentation/crypto/async-tx-api.txt +++ b/Documentation/crypto/async-tx-api.txt @@ -67,6 +67,10 @@ xor_val - xor a series of source buffers and set a flag if the pq - generate the p+q (raid6 syndrome) from a series of source buffers pq_val - validate that a p and or q buffer are in sync with a given series of sources +datap - (raid6_datap_recov) recover a raid6 data block and the p block + from the given sources +2data - (raid6_2data_recov) recover 2 raid6 data blocks from the given + sources 3.3 Descriptor management: The return value is non-NULL and points to a 'descriptor' when the operation diff --git a/crypto/async_tx/Kconfig b/crypto/async_tx/Kconfig index cb6d7314f198..e5aeb2b79e6f 100644 --- a/crypto/async_tx/Kconfig +++ b/crypto/async_tx/Kconfig @@ -18,3 +18,8 @@ config ASYNC_PQ tristate select ASYNC_CORE +config ASYNC_RAID6_RECOV + tristate + select ASYNC_CORE + select ASYNC_PQ + diff --git a/crypto/async_tx/Makefile b/crypto/async_tx/Makefile index 1b9926588259..9a1a76811b80 100644 --- a/crypto/async_tx/Makefile +++ b/crypto/async_tx/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_ASYNC_MEMCPY) += async_memcpy.o obj-$(CONFIG_ASYNC_MEMSET) += async_memset.o obj-$(CONFIG_ASYNC_XOR) += async_xor.o obj-$(CONFIG_ASYNC_PQ) += async_pq.o +obj-$(CONFIG_ASYNC_RAID6_RECOV) += async_raid6_recov.o diff --git a/crypto/async_tx/async_raid6_recov.c b/crypto/async_tx/async_raid6_recov.c new file mode 100644 index 000000000000..0c14d48c9896 --- /dev/null +++ b/crypto/async_tx/async_raid6_recov.c @@ -0,0 +1,448 @@ +/* + * Asynchronous RAID-6 recovery calculations ASYNC_TX API. + * Copyright(c) 2009 Intel Corporation + * + * based on raid6recov.c: + * Copyright 2002 H. Peter Anvin + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ +#include +#include +#include +#include +#include + +static struct dma_async_tx_descriptor * +async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef, + size_t len, struct async_submit_ctl *submit) +{ + struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ, + &dest, 1, srcs, 2, len); + struct dma_device *dma = chan ? chan->device : NULL; + const u8 *amul, *bmul; + u8 ax, bx; + u8 *a, *b, *c; + + if (dma) { + dma_addr_t dma_dest[2]; + dma_addr_t dma_src[2]; + struct device *dev = dma->dev; + struct dma_async_tx_descriptor *tx; + enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P; + + dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL); + dma_src[0] = dma_map_page(dev, srcs[0], 0, len, DMA_TO_DEVICE); + dma_src[1] = dma_map_page(dev, srcs[1], 0, len, DMA_TO_DEVICE); + tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 2, coef, + len, dma_flags); + if (tx) { + async_tx_submit(chan, tx, submit); + return tx; + } + } + + /* run the operation synchronously */ + async_tx_quiesce(&submit->depend_tx); + amul = raid6_gfmul[coef[0]]; + bmul = raid6_gfmul[coef[1]]; + a = page_address(srcs[0]); + b = page_address(srcs[1]); + c = page_address(dest); + + while (len--) { + ax = amul[*a++]; + bx = bmul[*b++]; + *c++ = ax ^ bx; + } + + return NULL; +} + +static struct dma_async_tx_descriptor * +async_mult(struct page *dest, struct page *src, u8 coef, size_t len, + struct async_submit_ctl *submit) +{ + struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ, + &dest, 1, &src, 1, len); + struct dma_device *dma = chan ? chan->device : NULL; + const u8 *qmul; /* Q multiplier table */ + u8 *d, *s; + + if (dma) { + dma_addr_t dma_dest[2]; + dma_addr_t dma_src[1]; + struct device *dev = dma->dev; + struct dma_async_tx_descriptor *tx; + enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P; + + dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL); + dma_src[0] = dma_map_page(dev, src, 0, len, DMA_TO_DEVICE); + tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 1, &coef, + len, dma_flags); + if (tx) { + async_tx_submit(chan, tx, submit); + return tx; + } + } + + /* no channel available, or failed to allocate a descriptor, so + * perform the operation synchronously + */ + async_tx_quiesce(&submit->depend_tx); + qmul = raid6_gfmul[coef]; + d = page_address(dest); + s = page_address(src); + + while (len--) + *d++ = qmul[*s++]; + + return NULL; +} + +static struct dma_async_tx_descriptor * +__2data_recov_4(size_t bytes, int faila, int failb, struct page **blocks, + struct async_submit_ctl *submit) +{ + struct dma_async_tx_descriptor *tx = NULL; + struct page *p, *q, *a, *b; + struct page *srcs[2]; + unsigned char coef[2]; + enum async_tx_flags flags = submit->flags; + dma_async_tx_callback cb_fn = submit->cb_fn; + void *cb_param = submit->cb_param; + void *scribble = submit->scribble; + + p = blocks[4-2]; + q = blocks[4-1]; + + a = blocks[faila]; + b = blocks[failb]; + + /* in the 4 disk case P + Pxy == P and Q + Qxy == Q */ + /* Dx = A*(P+Pxy) + B*(Q+Qxy) */ + srcs[0] = p; + srcs[1] = q; + coef[0] = raid6_gfexi[failb-faila]; + coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]; + init_async_submit(submit, 0, tx, NULL, NULL, scribble); + tx = async_sum_product(b, srcs, coef, bytes, submit); + + /* Dy = P+Pxy+Dx */ + srcs[0] = p; + srcs[1] = b; + init_async_submit(submit, flags | ASYNC_TX_XOR_ZERO_DST, tx, cb_fn, + cb_param, scribble); + tx = async_xor(a, srcs, 0, 2, bytes, submit); + + return tx; + +} + +static struct dma_async_tx_descriptor * +__2data_recov_5(size_t bytes, int faila, int failb, struct page **blocks, + struct async_submit_ctl *submit) +{ + struct dma_async_tx_descriptor *tx = NULL; + struct page *p, *q, *g, *dp, *dq; + struct page *srcs[2]; + unsigned char coef[2]; + enum async_tx_flags flags = submit->flags; + dma_async_tx_callback cb_fn = submit->cb_fn; + void *cb_param = submit->cb_param; + void *scribble = submit->scribble; + int uninitialized_var(good); + int i; + + for (i = 0; i < 3; i++) { + if (i == faila || i == failb) + continue; + else { + good = i; + break; + } + } + BUG_ON(i >= 3); + + p = blocks[5-2]; + q = blocks[5-1]; + g = blocks[good]; + + /* Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for delta p and + * delta q + */ + dp = blocks[faila]; + dq = blocks[failb]; + + init_async_submit(submit, 0, tx, NULL, NULL, scribble); + tx = async_memcpy(dp, g, 0, 0, bytes, submit); + init_async_submit(submit, 0, tx, NULL, NULL, scribble); + tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit); + + /* compute P + Pxy */ + srcs[0] = dp; + srcs[1] = p; + init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL, + scribble); + tx = async_xor(dp, srcs, 0, 2, bytes, submit); + + /* compute Q + Qxy */ + srcs[0] = dq; + srcs[1] = q; + init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL, + scribble); + tx = async_xor(dq, srcs, 0, 2, bytes, submit); + + /* Dx = A*(P+Pxy) + B*(Q+Qxy) */ + srcs[0] = dp; + srcs[1] = dq; + coef[0] = raid6_gfexi[failb-faila]; + coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]; + init_async_submit(submit, 0, tx, NULL, NULL, scribble); + tx = async_sum_product(dq, srcs, coef, bytes, submit); + + /* Dy = P+Pxy+Dx */ + srcs[0] = dp; + srcs[1] = dq; + init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn, + cb_param, scribble); + tx = async_xor(dp, srcs, 0, 2, bytes, submit); + + return tx; +} + +static struct dma_async_tx_descriptor * +__2data_recov_n(int disks, size_t bytes, int faila, int failb, + struct page **blocks, struct async_submit_ctl *submit) +{ + struct dma_async_tx_descriptor *tx = NULL; + struct page *p, *q, *dp, *dq; + struct page *srcs[2]; + unsigned char coef[2]; + enum async_tx_flags flags = submit->flags; + dma_async_tx_callback cb_fn = submit->cb_fn; + void *cb_param = submit->cb_param; + void *scribble = submit->scribble; + + p = blocks[disks-2]; + q = blocks[disks-1]; + + /* Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = blocks[faila]; + blocks[faila] = (void *)raid6_empty_zero_page; + blocks[disks-2] = dp; + dq = blocks[failb]; + blocks[failb] = (void *)raid6_empty_zero_page; + blocks[disks-1] = dq; + + init_async_submit(submit, 0, tx, NULL, NULL, scribble); + tx = async_gen_syndrome(blocks, 0, disks, bytes, submit); + + /* Restore pointer table */ + blocks[faila] = dp; + blocks[failb] = dq; + blocks[disks-2] = p; + blocks[disks-1] = q; + + /* compute P + Pxy */ + srcs[0] = dp; + srcs[1] = p; + init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL, + scribble); + tx = async_xor(dp, srcs, 0, 2, bytes, submit); + + /* compute Q + Qxy */ + srcs[0] = dq; + srcs[1] = q; + init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL, + scribble); + tx = async_xor(dq, srcs, 0, 2, bytes, submit); + + /* Dx = A*(P+Pxy) + B*(Q+Qxy) */ + srcs[0] = dp; + srcs[1] = dq; + coef[0] = raid6_gfexi[failb-faila]; + coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]; + init_async_submit(submit, 0, tx, NULL, NULL, scribble); + tx = async_sum_product(dq, srcs, coef, bytes, submit); + + /* Dy = P+Pxy+Dx */ + srcs[0] = dp; + srcs[1] = dq; + init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn, + cb_param, scribble); + tx = async_xor(dp, srcs, 0, 2, bytes, submit); + + return tx; +} + +/** + * async_raid6_2data_recov - asynchronously calculate two missing data blocks + * @disks: number of disks in the RAID-6 array + * @bytes: block size + * @faila: first failed drive index + * @failb: second failed drive index + * @blocks: array of source pointers where the last two entries are p and q + * @submit: submission/completion modifiers + */ +struct dma_async_tx_descriptor * +async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb, + struct page **blocks, struct async_submit_ctl *submit) +{ + BUG_ON(faila == failb); + if (failb < faila) + swap(faila, failb); + + pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes); + + /* we need to preserve the contents of 'blocks' for the async + * case, so punt to synchronous if a scribble buffer is not available + */ + if (!submit->scribble) { + void **ptrs = (void **) blocks; + int i; + + async_tx_quiesce(&submit->depend_tx); + for (i = 0; i < disks; i++) + ptrs[i] = page_address(blocks[i]); + + raid6_2data_recov(disks, bytes, faila, failb, ptrs); + + async_tx_sync_epilog(submit); + + return NULL; + } + + switch (disks) { + case 4: + /* dma devices do not uniformly understand a zero source pq + * operation (in contrast to the synchronous case), so + * explicitly handle the 4 disk special case + */ + return __2data_recov_4(bytes, faila, failb, blocks, submit); + case 5: + /* dma devices do not uniformly understand a single + * source pq operation (in contrast to the synchronous + * case), so explicitly handle the 5 disk special case + */ + return __2data_recov_5(bytes, faila, failb, blocks, submit); + default: + return __2data_recov_n(disks, bytes, faila, failb, blocks, submit); + } +} +EXPORT_SYMBOL_GPL(async_raid6_2data_recov); + +/** + * async_raid6_datap_recov - asynchronously calculate a data and the 'p' block + * @disks: number of disks in the RAID-6 array + * @bytes: block size + * @faila: failed drive index + * @blocks: array of source pointers where the last two entries are p and q + * @submit: submission/completion modifiers + */ +struct dma_async_tx_descriptor * +async_raid6_datap_recov(int disks, size_t bytes, int faila, + struct page **blocks, struct async_submit_ctl *submit) +{ + struct dma_async_tx_descriptor *tx = NULL; + struct page *p, *q, *dq; + u8 coef; + enum async_tx_flags flags = submit->flags; + dma_async_tx_callback cb_fn = submit->cb_fn; + void *cb_param = submit->cb_param; + void *scribble = submit->scribble; + struct page *srcs[2]; + + pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes); + + /* we need to preserve the contents of 'blocks' for the async + * case, so punt to synchronous if a scribble buffer is not available + */ + if (!scribble) { + void **ptrs = (void **) blocks; + int i; + + async_tx_quiesce(&submit->depend_tx); + for (i = 0; i < disks; i++) + ptrs[i] = page_address(blocks[i]); + + raid6_datap_recov(disks, bytes, faila, ptrs); + + async_tx_sync_epilog(submit); + + return NULL; + } + + p = blocks[disks-2]; + q = blocks[disks-1]; + + /* Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = blocks[faila]; + blocks[faila] = (void *)raid6_empty_zero_page; + blocks[disks-1] = dq; + + /* in the 4 disk case we only need to perform a single source + * multiplication + */ + if (disks == 4) { + int good = faila == 0 ? 1 : 0; + struct page *g = blocks[good]; + + init_async_submit(submit, 0, tx, NULL, NULL, scribble); + tx = async_memcpy(p, g, 0, 0, bytes, submit); + + init_async_submit(submit, 0, tx, NULL, NULL, scribble); + tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit); + } else { + init_async_submit(submit, 0, tx, NULL, NULL, scribble); + tx = async_gen_syndrome(blocks, 0, disks, bytes, submit); + } + + /* Restore pointer table */ + blocks[faila] = dq; + blocks[disks-1] = q; + + /* calculate g^{-faila} */ + coef = raid6_gfinv[raid6_gfexp[faila]]; + + srcs[0] = dq; + srcs[1] = q; + init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL, + scribble); + tx = async_xor(dq, srcs, 0, 2, bytes, submit); + + init_async_submit(submit, 0, tx, NULL, NULL, scribble); + tx = async_mult(dq, dq, coef, bytes, submit); + + srcs[0] = p; + srcs[1] = dq; + init_async_submit(submit, flags | ASYNC_TX_XOR_DROP_DST, tx, cb_fn, + cb_param, scribble); + tx = async_xor(p, srcs, 0, 2, bytes, submit); + + return tx; +} +EXPORT_SYMBOL_GPL(async_raid6_datap_recov); + +MODULE_AUTHOR("Dan Williams "); +MODULE_DESCRIPTION("asynchronous RAID-6 recovery api"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index e6ce5f004f98..866e61c4e2e0 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -194,5 +194,13 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int src_cnt, size_t len, enum sum_check_flags *pqres, struct page *spare, struct async_submit_ctl *submit); +struct dma_async_tx_descriptor * +async_raid6_2data_recov(int src_num, size_t bytes, int faila, int failb, + struct page **ptrs, struct async_submit_ctl *submit); + +struct dma_async_tx_descriptor * +async_raid6_datap_recov(int src_num, size_t bytes, int faila, + struct page **ptrs, struct async_submit_ctl *submit); + void async_tx_quiesce(struct dma_async_tx_descriptor **tx); #endif /* _ASYNC_TX_H_ */ -- cgit v1.2.3-71-gd317 From acdfcd04d9df7d084ff752f82afad6ed4ad5f363 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 28 Aug 2009 14:28:54 +0300 Subject: SLUB: fix ARCH_KMALLOC_MINALIGN cases 64 and 256 If the minalign is 64 bytes, then the 96 byte cache should not be created because it would conflict with the 128 byte cache. If the minalign is 256 bytes, patching the size_index table should not result in a buffer overrun. The calculation "(i - 1) / 8" used to access size_index[] is moved to a separate function as suggested by Christoph Lameter. Acked-by: Christoph Lameter Signed-off-by: Aaro Koskinen Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 6 ++---- mm/slub.c | 30 ++++++++++++++++++++++++------ 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 4dcbc2c71491..aa5d4a69d461 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -152,12 +152,10 @@ static __always_inline int kmalloc_index(size_t size) if (size <= KMALLOC_MIN_SIZE) return KMALLOC_SHIFT_LOW; -#if KMALLOC_MIN_SIZE <= 64 - if (size > 64 && size <= 96) + if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96) return 1; - if (size > 128 && size <= 192) + if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192) return 2; -#endif if (size <= 8) return 3; if (size <= 16) return 4; if (size <= 32) return 5; diff --git a/mm/slub.c b/mm/slub.c index e16c9fb1f48b..be493bd63c31 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2825,6 +2825,11 @@ static s8 size_index[24] = { 2 /* 192 */ }; +static inline int size_index_elem(size_t bytes) +{ + return (bytes - 1) / 8; +} + static struct kmem_cache *get_slab(size_t size, gfp_t flags) { int index; @@ -2833,7 +2838,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) if (!size) return ZERO_SIZE_PTR; - index = size_index[(size - 1) / 8]; + index = size_index[size_index_elem(size)]; } else index = fls(size - 1); @@ -3188,10 +3193,12 @@ void __init kmem_cache_init(void) slab_state = PARTIAL; /* Caches that are not of the two-to-the-power-of size */ - if (KMALLOC_MIN_SIZE <= 64) { + if (KMALLOC_MIN_SIZE <= 32) { create_kmalloc_cache(&kmalloc_caches[1], "kmalloc-96", 96, GFP_NOWAIT); caches++; + } + if (KMALLOC_MIN_SIZE <= 64) { create_kmalloc_cache(&kmalloc_caches[2], "kmalloc-192", 192, GFP_NOWAIT); caches++; @@ -3218,17 +3225,28 @@ void __init kmem_cache_init(void) BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); - for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) - size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; + for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { + int elem = size_index_elem(i); + if (elem >= ARRAY_SIZE(size_index)) + break; + size_index[elem] = KMALLOC_SHIFT_LOW; + } - if (KMALLOC_MIN_SIZE == 128) { + if (KMALLOC_MIN_SIZE == 64) { + /* + * The 96 byte size cache is not used if the alignment + * is 64 byte. + */ + for (i = 64 + 8; i <= 96; i += 8) + size_index[size_index_elem(i)] = 7; + } else if (KMALLOC_MIN_SIZE == 128) { /* * The 192 byte sized cache is not used if the alignment * is 128 byte. Redirect kmalloc to use the 256 byte cache * instead. */ for (i = 128 + 8; i <= 192; i += 8) - size_index[(i - 1) / 8] = 8; + size_index[size_index_elem(i)] = 8; } slab_state = UP; -- cgit v1.2.3-71-gd317 From e500011ffa191d662ac64d4ada6a5187b3180e16 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 30 Aug 2009 13:19:12 -0700 Subject: timers: Drop a function prototype Drop prototype for non-existent next_timer_interrupt() function. Signed-off-by: Randy Dunlap Cc: akpm LKML-Reference: <4A9ADEC0.70306@oracle.com> Signed-off-by: Thomas Gleixner --- include/linux/timer.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index be62ec2ebea5..a2d1eb6cb3f0 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -173,11 +173,6 @@ extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires); */ #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) -/* - * Return when the next timer-wheel timeout occurs (in absolute jiffies), - * locks the timer base: - */ -extern unsigned long next_timer_interrupt(void); /* * Return when the next timer-wheel timeout occurs (in absolute jiffies), * locks the timer base and does the comparison against the given -- cgit v1.2.3-71-gd317 From 8aa84ad8d6c740a04386f599694609ee4998e82e Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Fri, 24 Jul 2009 15:25:05 +0200 Subject: [CPUFREQ] Introduce global, not per core: /sys/devices/system/cpu/cpufreq Currently everything in the cpufreq layer is per core based. This does not reflect reality, for example ondemand on conservative governors have global sysfs variables. Introduce a global cpufreq directory and add the kobject to the governor struct, so that governors can easily access it. The directory is initialized in the cpufreq_core_init initcall and thus will always be created if cpufreq is compiled in, even if no cpufreq driver is active later. Signed-off-by: Thomas Renninger Signed-off-by: Dave Jones --- drivers/cpufreq/cpufreq.c | 9 ++++++++- include/linux/cpufreq.h | 10 ++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index bbd5c2164ab6..4da28444b235 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -686,6 +686,9 @@ static struct attribute *default_attrs[] = { NULL }; +struct kobject *cpufreq_global_kobject; +EXPORT_SYMBOL(cpufreq_global_kobject); + #define to_policy(k) container_of(k, struct cpufreq_policy, kobj) #define to_attr(a) container_of(a, struct freq_attr, attr) @@ -1935,7 +1938,11 @@ static int __init cpufreq_core_init(void) per_cpu(policy_cpu, cpu) = -1; init_rwsem(&per_cpu(cpu_policy_rwsem, cpu)); } + + cpufreq_global_kobject = kobject_create_and_add("cpufreq", + &cpu_sysdev_class.kset.kobj); + BUG_ON(!cpufreq_global_kobject); + return 0; } - core_initcall(cpufreq_core_init); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 161042746afc..44717eb47639 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -65,6 +65,9 @@ static inline int cpufreq_unregister_notifier(struct notifier_block *nb, struct cpufreq_governor; +/* /sys/devices/system/cpu/cpufreq: entry point for global variables */ +extern struct kobject *cpufreq_global_kobject; + #define CPUFREQ_ETERNAL (-1) struct cpufreq_cpuinfo { unsigned int max_freq; @@ -274,6 +277,13 @@ struct freq_attr { ssize_t (*store)(struct cpufreq_policy *, const char *, size_t count); }; +struct global_attr { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); + ssize_t (*store)(struct kobject *a, struct attribute *b, + const char *c, size_t count); +}; /********************************************************************* * CPUFREQ 2.6. INTERFACE * -- cgit v1.2.3-71-gd317 From 6d703a81ad5fdd102334751ddacb053ecc6ff046 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 1 Sep 2009 17:52:57 -0700 Subject: ide: convert to ->proc_fops ->read_proc, ->write_proc are going away, ->proc_fops should be used instead. The only tricky place is IDENTIFY handling: if for some reason taskfile_lib_get_identify() fails, buffer _is_ changed and at least first byte is overwritten. Emulate old behaviour with returning that first byte to userspace and reporting length=1 despite overall -E. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- drivers/ide/ide-cd.c | 28 +++- drivers/ide/ide-disk_proc.c | 129 +++++++++++------ drivers/ide/ide-floppy_proc.c | 30 ++-- drivers/ide/ide-proc.c | 330 +++++++++++++++++++++++++++--------------- drivers/ide/ide-tape.c | 31 ++-- include/linux/ide.h | 24 +-- 6 files changed, 365 insertions(+), 207 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index ad0ab0c0a493..b79ca419d8d9 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -1389,19 +1390,30 @@ static sector_t ide_cdrom_capacity(ide_drive_t *drive) return capacity * sectors_per_frame; } -static int proc_idecd_read_capacity(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int idecd_capacity_proc_show(struct seq_file *m, void *v) { - ide_drive_t *drive = data; - int len; + ide_drive_t *drive = m->private; - len = sprintf(page, "%llu\n", (long long)ide_cdrom_capacity(drive)); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(m, "%llu\n", (long long)ide_cdrom_capacity(drive)); + return 0; +} + +static int idecd_capacity_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, idecd_capacity_proc_show, PDE(inode)->data); } +static const struct file_operations idecd_capacity_proc_fops = { + .owner = THIS_MODULE, + .open = idecd_capacity_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static ide_proc_entry_t idecd_proc[] = { - { "capacity", S_IFREG|S_IRUGO, proc_idecd_read_capacity, NULL }, - { NULL, 0, NULL, NULL } + { "capacity", S_IFREG|S_IRUGO, &idecd_capacity_proc_fops }, + {} }; static ide_proc_entry_t *ide_cd_proc_entries(ide_drive_t *drive) diff --git a/drivers/ide/ide-disk_proc.c b/drivers/ide/ide-disk_proc.c index 19f263bf0a9e..60b0590ccc9c 100644 --- a/drivers/ide/ide-disk_proc.c +++ b/drivers/ide/ide-disk_proc.c @@ -1,5 +1,6 @@ #include #include +#include #include "ide-disk.h" @@ -37,77 +38,117 @@ static int get_smart_data(ide_drive_t *drive, u8 *buf, u8 sub_cmd) return ide_raw_taskfile(drive, &cmd, buf, 1); } -static int proc_idedisk_read_cache - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int idedisk_cache_proc_show(struct seq_file *m, void *v) { - ide_drive_t *drive = (ide_drive_t *) data; - char *out = page; - int len; + ide_drive_t *drive = (ide_drive_t *) m->private; if (drive->dev_flags & IDE_DFLAG_ID_READ) - len = sprintf(out, "%i\n", drive->id[ATA_ID_BUF_SIZE] / 2); + seq_printf(m, "%i\n", drive->id[ATA_ID_BUF_SIZE] / 2); else - len = sprintf(out, "(none)\n"); + seq_printf(m, "(none)\n"); + return 0; +} - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); +static int idedisk_cache_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, idedisk_cache_proc_show, PDE(inode)->data); } -static int proc_idedisk_read_capacity - (char *page, char **start, off_t off, int count, int *eof, void *data) +static const struct file_operations idedisk_cache_proc_fops = { + .owner = THIS_MODULE, + .open = idedisk_cache_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int idedisk_capacity_proc_show(struct seq_file *m, void *v) { - ide_drive_t*drive = (ide_drive_t *)data; - int len; + ide_drive_t*drive = (ide_drive_t *)m->private; - len = sprintf(page, "%llu\n", (long long)ide_gd_capacity(drive)); + seq_printf(m, "%llu\n", (long long)ide_gd_capacity(drive)); + return 0; +} - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); +static int idedisk_capacity_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, idedisk_capacity_proc_show, PDE(inode)->data); } -static int proc_idedisk_read_smart(char *page, char **start, off_t off, - int count, int *eof, void *data, u8 sub_cmd) +static const struct file_operations idedisk_capacity_proc_fops = { + .owner = THIS_MODULE, + .open = idedisk_capacity_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __idedisk_proc_show(struct seq_file *m, ide_drive_t *drive, u8 sub_cmd) { - ide_drive_t *drive = (ide_drive_t *)data; - int len = 0, i = 0; + u8 *buf; + + buf = kmalloc(SECTOR_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; (void)smart_enable(drive); - if (get_smart_data(drive, page, sub_cmd) == 0) { - unsigned short *val = (unsigned short *) page; - char *out = (char *)val + SECTOR_SIZE; - - page = out; - do { - out += sprintf(out, "%04x%c", le16_to_cpu(*val), - (++i & 7) ? ' ' : '\n'); - val += 1; - } while (i < SECTOR_SIZE / 2); - len = out - page; + if (get_smart_data(drive, buf, sub_cmd) == 0) { + __le16 *val = (__le16 *)buf; + int i; + + for (i = 0; i < SECTOR_SIZE / 2; i++) { + seq_printf(m, "%04x%c", le16_to_cpu(val[i]), + (i % 8) == 7 ? '\n' : ' '); + } } + kfree(buf); + return 0; +} - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); +static int idedisk_sv_proc_show(struct seq_file *m, void *v) +{ + return __idedisk_proc_show(m, m->private, ATA_SMART_READ_VALUES); } -static int proc_idedisk_read_sv - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int idedisk_sv_proc_open(struct inode *inode, struct file *file) { - return proc_idedisk_read_smart(page, start, off, count, eof, data, - ATA_SMART_READ_VALUES); + return single_open(file, idedisk_sv_proc_show, PDE(inode)->data); } -static int proc_idedisk_read_st - (char *page, char **start, off_t off, int count, int *eof, void *data) +static const struct file_operations idedisk_sv_proc_fops = { + .owner = THIS_MODULE, + .open = idedisk_sv_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int idedisk_st_proc_show(struct seq_file *m, void *v) { - return proc_idedisk_read_smart(page, start, off, count, eof, data, - ATA_SMART_READ_THRESHOLDS); + return __idedisk_proc_show(m, m->private, ATA_SMART_READ_THRESHOLDS); } +static int idedisk_st_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, idedisk_st_proc_show, PDE(inode)->data); +} + +static const struct file_operations idedisk_st_proc_fops = { + .owner = THIS_MODULE, + .open = idedisk_st_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + ide_proc_entry_t ide_disk_proc[] = { - { "cache", S_IFREG|S_IRUGO, proc_idedisk_read_cache, NULL }, - { "capacity", S_IFREG|S_IRUGO, proc_idedisk_read_capacity, NULL }, - { "geometry", S_IFREG|S_IRUGO, proc_ide_read_geometry, NULL }, - { "smart_values", S_IFREG|S_IRUSR, proc_idedisk_read_sv, NULL }, - { "smart_thresholds", S_IFREG|S_IRUSR, proc_idedisk_read_st, NULL }, - { NULL, 0, NULL, NULL } + { "cache", S_IFREG|S_IRUGO, &idedisk_cache_proc_fops }, + { "capacity", S_IFREG|S_IRUGO, &idedisk_capacity_proc_fops }, + { "geometry", S_IFREG|S_IRUGO, &ide_geometry_proc_fops }, + { "smart_values", S_IFREG|S_IRUSR, &idedisk_sv_proc_fops }, + { "smart_thresholds", S_IFREG|S_IRUSR, &idedisk_st_proc_fops }, + {} }; ide_devset_rw_field(bios_cyl, bios_cyl); diff --git a/drivers/ide/ide-floppy_proc.c b/drivers/ide/ide-floppy_proc.c index fcd4d8153df5..d711d9b883de 100644 --- a/drivers/ide/ide-floppy_proc.c +++ b/drivers/ide/ide-floppy_proc.c @@ -1,22 +1,34 @@ #include #include +#include #include "ide-floppy.h" -static int proc_idefloppy_read_capacity(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int idefloppy_capacity_proc_show(struct seq_file *m, void *v) { - ide_drive_t*drive = (ide_drive_t *)data; - int len; + ide_drive_t*drive = (ide_drive_t *)m->private; - len = sprintf(page, "%llu\n", (long long)ide_gd_capacity(drive)); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(m, "%llu\n", (long long)ide_gd_capacity(drive)); + return 0; } +static int idefloppy_capacity_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, idefloppy_capacity_proc_show, PDE(inode)->data); +} + +static const struct file_operations idefloppy_capacity_proc_fops = { + .owner = THIS_MODULE, + .open = idefloppy_capacity_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + ide_proc_entry_t ide_floppy_proc[] = { - { "capacity", S_IFREG|S_IRUGO, proc_idefloppy_read_capacity, NULL }, - { "geometry", S_IFREG|S_IRUGO, proc_ide_read_geometry, NULL }, - { NULL, 0, NULL, NULL } + { "capacity", S_IFREG|S_IRUGO, &idefloppy_capacity_proc_fops }, + { "geometry", S_IFREG|S_IRUGO, &ide_geometry_proc_fops }, + {} }; ide_devset_rw_field(bios_cyl, bios_cyl); diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c index 021de41655e6..28d09a5d8450 100644 --- a/drivers/ide/ide-proc.c +++ b/drivers/ide/ide-proc.c @@ -30,11 +30,9 @@ static struct proc_dir_entry *proc_ide_root; -static int proc_ide_read_imodel - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int ide_imodel_proc_show(struct seq_file *m, void *v) { - ide_hwif_t *hwif = (ide_hwif_t *) data; - int len; + ide_hwif_t *hwif = (ide_hwif_t *) m->private; const char *name; switch (hwif->chipset) { @@ -53,63 +51,108 @@ static int proc_ide_read_imodel case ide_acorn: name = "acorn"; break; default: name = "(unknown)"; break; } - len = sprintf(page, "%s\n", name); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(m, "%s\n", name); + return 0; } -static int proc_ide_read_mate - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int ide_imodel_proc_open(struct inode *inode, struct file *file) { - ide_hwif_t *hwif = (ide_hwif_t *) data; - int len; + return single_open(file, ide_imodel_proc_show, PDE(inode)->data); +} + +static const struct file_operations ide_imodel_proc_fops = { + .owner = THIS_MODULE, + .open = ide_imodel_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ide_mate_proc_show(struct seq_file *m, void *v) +{ + ide_hwif_t *hwif = (ide_hwif_t *) m->private; if (hwif && hwif->mate) - len = sprintf(page, "%s\n", hwif->mate->name); + seq_printf(m, "%s\n", hwif->mate->name); else - len = sprintf(page, "(none)\n"); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(m, "(none)\n"); + return 0; +} + +static int ide_mate_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ide_mate_proc_show, PDE(inode)->data); } -static int proc_ide_read_channel - (char *page, char **start, off_t off, int count, int *eof, void *data) +static const struct file_operations ide_mate_proc_fops = { + .owner = THIS_MODULE, + .open = ide_mate_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ide_channel_proc_show(struct seq_file *m, void *v) { - ide_hwif_t *hwif = (ide_hwif_t *) data; - int len; + ide_hwif_t *hwif = (ide_hwif_t *) m->private; - page[0] = hwif->channel ? '1' : '0'; - page[1] = '\n'; - len = 2; - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(m, "%c\n", hwif->channel ? '1' : '0'); + return 0; } -static int proc_ide_read_identify - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int ide_channel_proc_open(struct inode *inode, struct file *file) { - ide_drive_t *drive = (ide_drive_t *)data; - int len = 0, i = 0; - int err = 0; + return single_open(file, ide_channel_proc_show, PDE(inode)->data); +} - len = sprintf(page, "\n"); +static const struct file_operations ide_channel_proc_fops = { + .owner = THIS_MODULE, + .open = ide_channel_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; - if (drive) { - __le16 *val = (__le16 *)page; +static int ide_identify_proc_show(struct seq_file *m, void *v) +{ + ide_drive_t *drive = (ide_drive_t *)m->private; + u8 *buf; - err = taskfile_lib_get_identify(drive, page); - if (!err) { - char *out = (char *)page + SECTOR_SIZE; + if (!drive) { + seq_putc(m, '\n'); + return 0; + } - page = out; - do { - out += sprintf(out, "%04x%c", - le16_to_cpup(val), (++i & 7) ? ' ' : '\n'); - val += 1; - } while (i < SECTOR_SIZE / 2); - len = out - page; + buf = kmalloc(SECTOR_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + if (taskfile_lib_get_identify(drive, buf) == 0) { + __le16 *val = (__le16 *)buf; + int i; + + for (i = 0; i < SECTOR_SIZE / 2; i++) { + seq_printf(m, "%04x%c", le16_to_cpu(val[i]), + (i % 8) == 7 ? '\n' : ' '); } - } - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + } else + seq_putc(m, buf[0]); + kfree(buf); + return 0; +} + +static int ide_identify_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ide_identify_proc_show, PDE(inode)->data); } +static const struct file_operations ide_identify_proc_fops = { + .owner = THIS_MODULE, + .open = ide_identify_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /** * ide_find_setting - find a specific setting * @st: setting table pointer @@ -240,22 +283,20 @@ static void proc_ide_settings_warn(void) warned = 1; } -static int proc_ide_read_settings - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int ide_settings_proc_show(struct seq_file *m, void *v) { const struct ide_proc_devset *setting, *g, *d; const struct ide_devset *ds; - ide_drive_t *drive = (ide_drive_t *) data; - char *out = page; - int len, rc, mul_factor, div_factor; + ide_drive_t *drive = (ide_drive_t *) m->private; + int rc, mul_factor, div_factor; proc_ide_settings_warn(); mutex_lock(&ide_setting_mtx); g = ide_generic_settings; d = drive->settings; - out += sprintf(out, "name\t\t\tvalue\t\tmin\t\tmax\t\tmode\n"); - out += sprintf(out, "----\t\t\t-----\t\t---\t\t---\t\t----\n"); + seq_printf(m, "name\t\t\tvalue\t\tmin\t\tmax\t\tmode\n"); + seq_printf(m, "----\t\t\t-----\t\t---\t\t---\t\t----\n"); while (g->name || (d && d->name)) { /* read settings in the alphabetical order */ if (g->name && d && d->name) { @@ -269,31 +310,35 @@ static int proc_ide_read_settings setting = g++; mul_factor = setting->mulf ? setting->mulf(drive) : 1; div_factor = setting->divf ? setting->divf(drive) : 1; - out += sprintf(out, "%-24s", setting->name); + seq_printf(m, "%-24s", setting->name); rc = ide_read_setting(drive, setting); if (rc >= 0) - out += sprintf(out, "%-16d", rc * mul_factor / div_factor); + seq_printf(m, "%-16d", rc * mul_factor / div_factor); else - out += sprintf(out, "%-16s", "write-only"); - out += sprintf(out, "%-16d%-16d", (setting->min * mul_factor + div_factor - 1) / div_factor, setting->max * mul_factor / div_factor); + seq_printf(m, "%-16s", "write-only"); + seq_printf(m, "%-16d%-16d", (setting->min * mul_factor + div_factor - 1) / div_factor, setting->max * mul_factor / div_factor); ds = setting->setting; if (ds->get) - out += sprintf(out, "r"); + seq_printf(m, "r"); if (ds->set) - out += sprintf(out, "w"); - out += sprintf(out, "\n"); + seq_printf(m, "w"); + seq_printf(m, "\n"); } - len = out - page; mutex_unlock(&ide_setting_mtx); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + return 0; +} + +static int ide_settings_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ide_settings_proc_show, PDE(inode)->data); } #define MAX_LEN 30 -static int proc_ide_write_settings(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static ssize_t ide_settings_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *pos) { - ide_drive_t *drive = (ide_drive_t *) data; + ide_drive_t *drive = (ide_drive_t *) PDE(file->f_path.dentry->d_inode)->data; char name[MAX_LEN + 1]; int for_real = 0, mul_factor, div_factor; unsigned long n; @@ -388,63 +433,104 @@ static int proc_ide_write_settings(struct file *file, const char __user *buffer, return count; parse_error: free_page((unsigned long)buf); - printk("proc_ide_write_settings(): parse error\n"); + printk("%s(): parse error\n", __func__); return -EINVAL; } -int proc_ide_read_capacity - (char *page, char **start, off_t off, int count, int *eof, void *data) +static const struct file_operations ide_settings_proc_fops = { + .owner = THIS_MODULE, + .open = ide_settings_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = ide_settings_proc_write, +}; + +static int ide_capacity_proc_show(struct seq_file *m, void *v) { - int len = sprintf(page, "%llu\n", (long long)0x7fffffff); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(m, "%llu\n", (long long)0x7fffffff); + return 0; } -EXPORT_SYMBOL_GPL(proc_ide_read_capacity); +static int ide_capacity_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ide_capacity_proc_show, NULL); +} -int proc_ide_read_geometry - (char *page, char **start, off_t off, int count, int *eof, void *data) +const struct file_operations ide_capacity_proc_fops = { + .owner = THIS_MODULE, + .open = ide_capacity_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +EXPORT_SYMBOL_GPL(ide_capacity_proc_fops); + +static int ide_geometry_proc_show(struct seq_file *m, void *v) { - ide_drive_t *drive = (ide_drive_t *) data; - char *out = page; - int len; + ide_drive_t *drive = (ide_drive_t *) m->private; - out += sprintf(out, "physical %d/%d/%d\n", + seq_printf(m, "physical %d/%d/%d\n", drive->cyl, drive->head, drive->sect); - out += sprintf(out, "logical %d/%d/%d\n", + seq_printf(m, "logical %d/%d/%d\n", drive->bios_cyl, drive->bios_head, drive->bios_sect); + return 0; +} - len = out - page; - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); +static int ide_geometry_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ide_geometry_proc_show, PDE(inode)->data); } -EXPORT_SYMBOL(proc_ide_read_geometry); +const struct file_operations ide_geometry_proc_fops = { + .owner = THIS_MODULE, + .open = ide_geometry_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +EXPORT_SYMBOL(ide_geometry_proc_fops); -static int proc_ide_read_dmodel - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int ide_dmodel_proc_show(struct seq_file *seq, void *v) { - ide_drive_t *drive = (ide_drive_t *) data; + ide_drive_t *drive = (ide_drive_t *) seq->private; char *m = (char *)&drive->id[ATA_ID_PROD]; - int len; - len = sprintf(page, "%.40s\n", m[0] ? m : "(none)"); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(seq, "%.40s\n", m[0] ? m : "(none)"); + return 0; +} + +static int ide_dmodel_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ide_dmodel_proc_show, PDE(inode)->data); } -static int proc_ide_read_driver - (char *page, char **start, off_t off, int count, int *eof, void *data) +static const struct file_operations ide_dmodel_proc_fops = { + .owner = THIS_MODULE, + .open = ide_dmodel_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ide_driver_proc_show(struct seq_file *m, void *v) { - ide_drive_t *drive = (ide_drive_t *)data; + ide_drive_t *drive = (ide_drive_t *)m->private; struct device *dev = &drive->gendev; struct ide_driver *ide_drv; - int len; if (dev->driver) { ide_drv = to_ide_driver(dev->driver); - len = sprintf(page, "%s version %s\n", + seq_printf(m, "%s version %s\n", dev->driver->name, ide_drv->version); } else - len = sprintf(page, "ide-default version 0.9.newide\n"); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(m, "ide-default version 0.9.newide\n"); + return 0; +} + +static int ide_driver_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ide_driver_proc_show, PDE(inode)->data); } static int ide_replace_subdriver(ide_drive_t *drive, const char *driver) @@ -474,10 +560,10 @@ static int ide_replace_subdriver(ide_drive_t *drive, const char *driver) return ret; } -static int proc_ide_write_driver - (struct file *file, const char __user *buffer, unsigned long count, void *data) +static ssize_t ide_driver_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *pos) { - ide_drive_t *drive = (ide_drive_t *) data; + ide_drive_t *drive = (ide_drive_t *) PDE(file->f_path.dentry->d_inode)->data; char name[32]; if (!capable(CAP_SYS_ADMIN)) @@ -492,12 +578,19 @@ static int proc_ide_write_driver return count; } -static int proc_ide_read_media - (char *page, char **start, off_t off, int count, int *eof, void *data) +static const struct file_operations ide_driver_proc_fops = { + .owner = THIS_MODULE, + .open = ide_driver_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = ide_driver_proc_write, +}; + +static int ide_media_proc_show(struct seq_file *m, void *v) { - ide_drive_t *drive = (ide_drive_t *) data; + ide_drive_t *drive = (ide_drive_t *) m->private; const char *media; - int len; switch (drive->media) { case ide_disk: media = "disk\n"; break; @@ -507,20 +600,30 @@ static int proc_ide_read_media case ide_optical: media = "optical\n"; break; default: media = "UNKNOWN\n"; break; } - strcpy(page, media); - len = strlen(media); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_puts(m, media); + return 0; +} + +static int ide_media_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ide_media_proc_show, PDE(inode)->data); } +static const struct file_operations ide_media_proc_fops = { + .owner = THIS_MODULE, + .open = ide_media_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static ide_proc_entry_t generic_drive_entries[] = { - { "driver", S_IFREG|S_IRUGO, proc_ide_read_driver, - proc_ide_write_driver }, - { "identify", S_IFREG|S_IRUSR, proc_ide_read_identify, NULL }, - { "media", S_IFREG|S_IRUGO, proc_ide_read_media, NULL }, - { "model", S_IFREG|S_IRUGO, proc_ide_read_dmodel, NULL }, - { "settings", S_IFREG|S_IRUSR|S_IWUSR, proc_ide_read_settings, - proc_ide_write_settings }, - { NULL, 0, NULL, NULL } + { "driver", S_IFREG|S_IRUGO, &ide_driver_proc_fops }, + { "identify", S_IFREG|S_IRUSR, &ide_identify_proc_fops}, + { "media", S_IFREG|S_IRUGO, &ide_media_proc_fops }, + { "model", S_IFREG|S_IRUGO, &ide_dmodel_proc_fops }, + { "settings", S_IFREG|S_IRUSR|S_IWUSR, &ide_settings_proc_fops}, + {} }; static void ide_add_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t *p, void *data) @@ -530,11 +633,8 @@ static void ide_add_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t *p if (!dir || !p) return; while (p->name != NULL) { - ent = create_proc_entry(p->name, p->mode, dir); + ent = proc_create_data(p->name, p->mode, dir, p->proc_fops, data); if (!ent) return; - ent->data = data; - ent->read_proc = p->read_proc; - ent->write_proc = p->write_proc; p++; } } @@ -617,10 +717,10 @@ void ide_proc_unregister_device(ide_drive_t *drive) } static ide_proc_entry_t hwif_entries[] = { - { "channel", S_IFREG|S_IRUGO, proc_ide_read_channel, NULL }, - { "mate", S_IFREG|S_IRUGO, proc_ide_read_mate, NULL }, - { "model", S_IFREG|S_IRUGO, proc_ide_read_imodel, NULL }, - { NULL, 0, NULL, NULL } + { "channel", S_IFREG|S_IRUGO, &ide_channel_proc_fops }, + { "mate", S_IFREG|S_IRUGO, &ide_mate_proc_fops }, + { "model", S_IFREG|S_IRUGO, &ide_imodel_proc_fops }, + {} }; void ide_proc_register_port(ide_hwif_t *hwif) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 7b2032bc357b..9d6f62baac27 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -1816,22 +1817,32 @@ static void ide_tape_release(struct device *dev) } #ifdef CONFIG_IDE_PROC_FS -static int proc_idetape_read_name - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int idetape_name_proc_show(struct seq_file *m, void *v) { - ide_drive_t *drive = (ide_drive_t *) data; + ide_drive_t *drive = (ide_drive_t *) m->private; idetape_tape_t *tape = drive->driver_data; - char *out = page; - int len; - len = sprintf(out, "%s\n", tape->name); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(m, "%s\n", tape->name); + return 0; +} + +static int idetape_name_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, idetape_name_proc_show, PDE(inode)->data); } +static const struct file_operations idetape_name_proc_fops = { + .owner = THIS_MODULE, + .open = idetape_name_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static ide_proc_entry_t idetape_proc[] = { - { "capacity", S_IFREG|S_IRUGO, proc_ide_read_capacity, NULL }, - { "name", S_IFREG|S_IRUGO, proc_idetape_read_name, NULL }, - { NULL, 0, NULL, NULL } + { "capacity", S_IFREG|S_IRUGO, &ide_capacity_proc_fops }, + { "name", S_IFREG|S_IRUGO, &idetape_name_proc_fops }, + {} }; static ide_proc_entry_t *ide_tape_proc_entries(ide_drive_t *drive) diff --git a/include/linux/ide.h b/include/linux/ide.h index 803c1ae31237..e4135d6e0556 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -919,8 +919,7 @@ __IDE_PROC_DEVSET(_name, _min, _max, NULL, NULL) typedef struct { const char *name; mode_t mode; - read_proc_t *read_proc; - write_proc_t *write_proc; + const struct file_operations *proc_fops; } ide_proc_entry_t; void proc_ide_create(void); @@ -932,24 +931,8 @@ void ide_proc_unregister_port(ide_hwif_t *); void ide_proc_register_driver(ide_drive_t *, struct ide_driver *); void ide_proc_unregister_driver(ide_drive_t *, struct ide_driver *); -read_proc_t proc_ide_read_capacity; -read_proc_t proc_ide_read_geometry; - -/* - * Standard exit stuff: - */ -#define PROC_IDE_READ_RETURN(page,start,off,count,eof,len) \ -{ \ - len -= off; \ - if (len < count) { \ - *eof = 1; \ - if (len <= 0) \ - return 0; \ - } else \ - len = count; \ - *start = page + off; \ - return len; \ -} +extern const struct file_operations ide_capacity_proc_fops; +extern const struct file_operations ide_geometry_proc_fops; #else static inline void proc_ide_create(void) { ; } static inline void proc_ide_destroy(void) { ; } @@ -961,7 +944,6 @@ static inline void ide_proc_register_driver(ide_drive_t *drive, struct ide_driver *driver) { ; } static inline void ide_proc_unregister_driver(ide_drive_t *drive, struct ide_driver *driver) { ; } -#define PROC_IDE_READ_RETURN(page,start,off,count,eof,len) return 0; #endif enum { -- cgit v1.2.3-71-gd317 From 69575d388603365f2afbf4166df93152df59b165 Mon Sep 17 00:00:00 2001 From: Shane Wang Date: Tue, 1 Sep 2009 18:25:07 -0700 Subject: x86, intel_txt: clean up the impact on generic code, unbreak non-x86 Move tboot.h from asm to linux to fix the build errors of intel_txt patch on non-X86 platforms. Remove the tboot code from generic code init/main.c and kernel/cpu.c. Signed-off-by: Shane Wang Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 4 + arch/x86/include/asm/tboot.h | 197 ------------------------------------------ arch/x86/kernel/reboot.c | 3 +- arch/x86/kernel/setup.c | 3 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/tboot.c | 58 ++++++++++--- drivers/acpi/acpica/hwsleep.c | 2 +- drivers/pci/dmar.c | 2 +- drivers/pci/intel-iommu.c | 2 +- include/linux/tboot.h | 162 ++++++++++++++++++++++++++++++++++ init/main.c | 3 - kernel/cpu.c | 6 +- security/Kconfig | 2 +- 13 files changed, 221 insertions(+), 225 deletions(-) delete mode 100644 arch/x86/include/asm/tboot.h create mode 100644 include/linux/tboot.h (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 738bdc6b0f8b..b66f2102c35d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -178,6 +178,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_INTEL_TXT + def_bool y + depends on EXPERIMENTAL && DMAR && ACPI + # Use the generic interrupt handling code in kernel/irq/: config GENERIC_HARDIRQS bool diff --git a/arch/x86/include/asm/tboot.h b/arch/x86/include/asm/tboot.h deleted file mode 100644 index b13929d4e5f4..000000000000 --- a/arch/x86/include/asm/tboot.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * tboot.h: shared data structure with tboot and kernel and functions - * used by kernel for runtime support of Intel(R) Trusted - * Execution Technology - * - * Copyright (c) 2006-2009, Intel Corporation - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * - */ - -#ifndef _ASM_TBOOT_H -#define _ASM_TBOOT_H - -#include - -/* these must have the values from 0-5 in this order */ -enum { - TB_SHUTDOWN_REBOOT = 0, - TB_SHUTDOWN_S5, - TB_SHUTDOWN_S4, - TB_SHUTDOWN_S3, - TB_SHUTDOWN_HALT, - TB_SHUTDOWN_WFS -}; - -#ifdef CONFIG_INTEL_TXT - -/* used to communicate between tboot and the launched kernel */ - -#define TB_KEY_SIZE 64 /* 512 bits */ - -#define MAX_TB_MAC_REGIONS 32 - -struct tboot_mac_region { - u64 start; /* must be 64 byte -aligned */ - u32 size; /* must be 64 byte -granular */ -} __packed; - -/* GAS - Generic Address Structure (ACPI 2.0+) */ -struct tboot_acpi_generic_address { - u8 space_id; - u8 bit_width; - u8 bit_offset; - u8 access_width; - u64 address; -} __packed; - -/* - * combines Sx info from FADT and FACS tables per ACPI 2.0+ spec - * (http://www.acpi.info/) - */ -struct tboot_acpi_sleep_info { - struct tboot_acpi_generic_address pm1a_cnt_blk; - struct tboot_acpi_generic_address pm1b_cnt_blk; - struct tboot_acpi_generic_address pm1a_evt_blk; - struct tboot_acpi_generic_address pm1b_evt_blk; - u16 pm1a_cnt_val; - u16 pm1b_cnt_val; - u64 wakeup_vector; - u32 vector_width; - u64 kernel_s3_resume_vector; -} __packed; - -/* - * shared memory page used for communication between tboot and kernel - */ -struct tboot { - /* - * version 3+ fields: - */ - - /* TBOOT_UUID */ - u8 uuid[16]; - - /* version number: 5 is current */ - u32 version; - - /* physical addr of tb_log_t log */ - u32 log_addr; - - /* - * physical addr of entry point for tboot shutdown and - * type of shutdown (TB_SHUTDOWN_*) being requested - */ - u32 shutdown_entry; - u32 shutdown_type; - - /* kernel-specified ACPI info for Sx shutdown */ - struct tboot_acpi_sleep_info acpi_sinfo; - - /* tboot location in memory (physical) */ - u32 tboot_base; - u32 tboot_size; - - /* memory regions (phys addrs) for tboot to MAC on S3 */ - u8 num_mac_regions; - struct tboot_mac_region mac_regions[MAX_TB_MAC_REGIONS]; - - - /* - * version 4+ fields: - */ - - /* symmetric key for use by kernel; will be encrypted on S3 */ - u8 s3_key[TB_KEY_SIZE]; - - - /* - * version 5+ fields: - */ - - /* used to 4byte-align num_in_wfs */ - u8 reserved_align[3]; - - /* number of processors in wait-for-SIPI */ - u32 num_in_wfs; -} __packed; - -/* - * UUID for tboot data struct to facilitate matching - * defined as {663C8DFF-E8B3-4b82-AABF-19EA4D057A08} by tboot, which is - * represented as {} in the char array used here - */ -#define TBOOT_UUID {0xff, 0x8d, 0x3c, 0x66, 0xb3, 0xe8, 0x82, 0x4b, 0xbf,\ - 0xaa, 0x19, 0xea, 0x4d, 0x5, 0x7a, 0x8} - -extern struct tboot *tboot; - -static inline int tboot_enabled(void) -{ - return tboot != NULL; -} - -extern void tboot_probe(void); -extern void tboot_create_trampoline(void); -extern void tboot_shutdown(u32 shutdown_type); -extern void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control); -extern int tboot_wait_for_aps(int num_aps); -extern struct acpi_table_header *tboot_get_dmar_table( - struct acpi_table_header *dmar_tbl); -extern int tboot_force_iommu(void); - -#else /* CONFIG_INTEL_TXT */ - -static inline int tboot_enabled(void) -{ - return 0; -} - -static inline void tboot_probe(void) -{ -} - -static inline void tboot_create_trampoline(void) -{ -} - -static inline void tboot_shutdown(u32 shutdown_type) -{ -} - -static inline void tboot_sleep(u8 sleep_state, u32 pm1a_control, - u32 pm1b_control) -{ -} - -static inline int tboot_wait_for_aps(int num_aps) -{ - return 0; -} - -static inline struct acpi_table_header *tboot_get_dmar_table( - struct acpi_table_header *dmar_tbl) -{ - return dmar_tbl; -} - -static inline int tboot_force_iommu(void) -{ - return 0; -} - -#endif /* !CONFIG_INTEL_TXT */ - -#endif /* _ASM_TBOOT_H */ diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9de01c5d9794..18ce5c04242a 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -24,8 +25,6 @@ # include #endif -#include - /* * Power off function, if any */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 80d6e9e32483..6ce0d6f38f7f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -66,6 +66,7 @@ #include #include +#include #include