From 1fa44ecad2b86475e038aed81b0bf333fa484f8b Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Thu, 23 Feb 2006 12:43:43 -0600 Subject: [SCSI] add execute_in_process_context() API We have several points in the SCSI stack (primarily for our device functions) where we need to guarantee process context, but (given the place where the last reference was released) we cannot guarantee this. This API gets around the issue by executing the function directly if the caller has process context, but scheduling a workqueue to execute in process context if the caller doesn't have it. Signed-off-by: James Bottomley --- include/linux/workqueue.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 86b111300231..957c21c16d62 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -20,6 +20,10 @@ struct work_struct { struct timer_list timer; }; +struct execute_work { + struct work_struct work; +}; + #define __WORK_INITIALIZER(n, f, d) { \ .entry = { &(n).entry, &(n).entry }, \ .func = (f), \ @@ -74,6 +78,8 @@ extern void init_workqueues(void); void cancel_rearming_delayed_work(struct work_struct *work); void cancel_rearming_delayed_workqueue(struct workqueue_struct *, struct work_struct *); +int execute_in_process_context(void (*fn)(void *), void *, + struct execute_work *); /* * Kill off a pending schedule_delayed_work(). Note that the work callback -- cgit v1.2.3-71-gd317 From 044cc6c8ec311c4ddeebfcc31c53dea282de70b7 Mon Sep 17 00:00:00 2001 From: "andrew.vasquez@qlogic.com" Date: Thu, 9 Mar 2006 14:27:13 -0800 Subject: [SCSI] qla2xxx: Add ISP54xx support. Chip is similar in form to our ISP24xx offering. 
Signed-off-by: Andrew Vasquez Signed-off-by: James Bottomley --- drivers/scsi/qla2xxx/ql2400.c | 27 +++++++++++++++++++++++++ drivers/scsi/qla2xxx/qla_attr.c | 8 ++++---- drivers/scsi/qla2xxx/qla_def.h | 12 +++++------ drivers/scsi/qla2xxx/qla_gs.c | 10 ++++------ drivers/scsi/qla2xxx/qla_init.c | 14 ++++++------- drivers/scsi/qla2xxx/qla_inline.h | 2 +- drivers/scsi/qla2xxx/qla_iocb.c | 6 +++--- drivers/scsi/qla2xxx/qla_isr.c | 16 +++++++-------- drivers/scsi/qla2xxx/qla_mbx.c | 42 +++++++++++++++++++-------------------- drivers/scsi/qla2xxx/qla_os.c | 18 ++++++++++------- include/linux/pci_ids.h | 2 ++ 11 files changed, 94 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/drivers/scsi/qla2xxx/ql2400.c b/drivers/scsi/qla2xxx/ql2400.c index 6c7165f47e29..77914fcfa2bc 100644 --- a/drivers/scsi/qla2xxx/ql2400.c +++ b/drivers/scsi/qla2xxx/ql2400.c @@ -49,6 +49,18 @@ static struct qla_board_info qla_board_tbl[] = { .fw_info = qla_fw_tbl, .fw_fname = "ql2400_fw.bin", }, + { + .drv_name = qla_driver_name, + .isp_name = "ISP5422", + .fw_info = qla_fw_tbl, + .fw_fname = "ql2400_fw.bin", + }, + { + .drv_name = qla_driver_name, + .isp_name = "ISP5432", + .fw_info = qla_fw_tbl, + .fw_fname = "ql2400_fw.bin", + }, }; static struct pci_device_id qla24xx_pci_tbl[] = { @@ -66,6 +78,21 @@ static struct pci_device_id qla24xx_pci_tbl[] = { .subdevice = PCI_ANY_ID, .driver_data = (unsigned long)&qla_board_tbl[1], }, + { + .vendor = PCI_VENDOR_ID_QLOGIC, + .device = PCI_DEVICE_ID_QLOGIC_ISP5422, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .driver_data = (unsigned long)&qla_board_tbl[2], + }, + { + .vendor = PCI_VENDOR_ID_QLOGIC, + .device = PCI_DEVICE_ID_QLOGIC_ISP5432, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .driver_data = (unsigned long)&qla_board_tbl[3], + }, + {0, 0}, }; MODULE_DEVICE_TABLE(pci, qla24xx_pci_tbl); diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index 92b3e13e9061..2b9e329a240c 100644 
--- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -50,7 +50,7 @@ qla2x00_sysfs_write_fw_dump(struct kobject *kobj, char *buf, loff_t off, ha->host_no); vfree(ha->fw_dump_buffer); - if (!IS_QLA24XX(ha) && !IS_QLA25XX(ha)) + if (!IS_QLA24XX(ha) && !IS_QLA54XX(ha)) free_pages((unsigned long)ha->fw_dump, ha->fw_dump_order); @@ -64,7 +64,7 @@ qla2x00_sysfs_write_fw_dump(struct kobject *kobj, char *buf, loff_t off, if ((ha->fw_dump || ha->fw_dumped) && !ha->fw_dump_reading) { ha->fw_dump_reading = 1; - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) dump_size = FW_DUMP_SIZE_24XX; else { dump_size = FW_DUMP_SIZE_1M; @@ -138,7 +138,7 @@ qla2x00_sysfs_write_nvram(struct kobject *kobj, char *buf, loff_t off, return 0; /* Checksum NVRAM. */ - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { uint32_t *iter; uint32_t chksum; @@ -750,7 +750,7 @@ qla2x00_get_fc_host_stats(struct Scsi_Host *shost) pfc_host_stat = &ha->fc_host_stat; memset(pfc_host_stat, -1, sizeof(struct fc_host_statistics)); - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { rval = qla24xx_get_isp_stats(ha, (uint32_t *)&stat_buf, sizeof(stat_buf) / 4, mb_stat); } else { diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index 00b7e82b99b3..e1a7769008ee 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -2234,9 +2234,9 @@ typedef struct scsi_qla_host { #define DT_ISP6322 BIT_6 #define DT_ISP2422 BIT_7 #define DT_ISP2432 BIT_8 -#define DT_ISP2512 BIT_9 -#define DT_ISP2522 BIT_10 -#define DT_ISP_LAST (DT_ISP2522 << 1) +#define DT_ISP5422 BIT_9 +#define DT_ISP5432 BIT_10 +#define DT_ISP_LAST (DT_ISP5432 << 1) #define DT_OEM_001 BIT_29 #define DT_ISP2200A BIT_30 @@ -2252,13 +2252,13 @@ typedef struct scsi_qla_host { #define IS_QLA6322(ha) (DT_MASK(ha) & DT_ISP6322) #define IS_QLA2422(ha) (DT_MASK(ha) & DT_ISP2422) #define IS_QLA2432(ha) 
(DT_MASK(ha) & DT_ISP2432) -#define IS_QLA2512(ha) (DT_MASK(ha) & DT_ISP2512) -#define IS_QLA2522(ha) (DT_MASK(ha) & DT_ISP2522) +#define IS_QLA5422(ha) (DT_MASK(ha) & DT_ISP5422) +#define IS_QLA5432(ha) (DT_MASK(ha) & DT_ISP5432) #define IS_QLA23XX(ha) (IS_QLA2300(ha) || IS_QLA2312(ha) || IS_QLA2322(ha) || \ IS_QLA6312(ha) || IS_QLA6322(ha)) #define IS_QLA24XX(ha) (IS_QLA2422(ha) || IS_QLA2432(ha)) -#define IS_QLA25XX(ha) (IS_QLA2512(ha) || IS_QLA2522(ha)) +#define IS_QLA54XX(ha) (IS_QLA5422(ha) || IS_QLA5432(ha)) #define IS_OEM_001(ha) ((ha)->device_type & DT_OEM_001) #define HAS_EXTENDED_IDS(ha) ((ha)->device_type & DT_EXTENDED_IDS) diff --git a/drivers/scsi/qla2xxx/qla_gs.c b/drivers/scsi/qla2xxx/qla_gs.c index d620a8e8a614..2ebf259fccb2 100644 --- a/drivers/scsi/qla2xxx/qla_gs.c +++ b/drivers/scsi/qla2xxx/qla_gs.c @@ -126,7 +126,7 @@ qla2x00_chk_ms_status(scsi_qla_host_t *ha, ms_iocb_entry_t *ms_pkt, DEBUG2_3(printk("scsi(%ld): %s failed, error status (%x).\n", ha->host_no, routine, ms_pkt->entry_status)); } else { - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) comp_status = ((struct ct_entry_24xx *)ms_pkt)->comp_status; else @@ -1200,7 +1200,7 @@ qla2x00_update_ms_fdmi_iocb(scsi_qla_host_t *ha, uint32_t req_size) ms_iocb_entry_t *ms_pkt = ha->ms_iocb; struct ct_entry_24xx *ct_pkt = (struct ct_entry_24xx *)ha->ms_iocb; - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { ct_pkt->cmd_byte_count = cpu_to_le32(req_size); ct_pkt->dseg_0_len = ct_pkt->cmd_byte_count; } else { @@ -1529,9 +1529,7 @@ qla2x00_fdmi_rpa(scsi_qla_host_t *ha) eiter = (struct ct_fdmi_port_attr *) (entries + size); eiter->type = __constant_cpu_to_be16(FDMI_PORT_SUPPORT_SPEED); eiter->len = __constant_cpu_to_be16(4 + 4); - if (IS_QLA25XX(ha)) - eiter->a.sup_speed = __constant_cpu_to_be32(8); - else if (IS_QLA24XX(ha)) + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) eiter->a.sup_speed = __constant_cpu_to_be32(4); else if (IS_QLA23XX(ha)) 
eiter->a.sup_speed = __constant_cpu_to_be32(2); @@ -1566,7 +1564,7 @@ qla2x00_fdmi_rpa(scsi_qla_host_t *ha) eiter = (struct ct_fdmi_port_attr *) (entries + size); eiter->type = __constant_cpu_to_be16(FDMI_PORT_MAX_FRAME_SIZE); eiter->len = __constant_cpu_to_be16(4 + 4); - max_frame_size = IS_QLA24XX(ha) || IS_QLA25XX(ha) ? + max_frame_size = IS_QLA24XX(ha) || IS_QLA54XX(ha) ? (uint32_t) icb24->frame_payload_size: (uint32_t) ha->init_cb->frame_payload_size; eiter->a.max_frame_size = cpu_to_be32(max_frame_size); diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index f49eb06d0dbd..e6a2292a2892 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -387,7 +387,7 @@ qla2x00_isp_firmware(scsi_qla_host_t *ha) /* Verify checksum of loaded RISC code. */ rval = qla2x00_verify_checksum(ha, - IS_QLA24XX(ha) || IS_QLA25XX(ha) ? RISC_SADDRESS : + IS_QLA24XX(ha) || IS_QLA54XX(ha) ? RISC_SADDRESS : *ha->brd_info->fw_info[0].fwstart); } @@ -822,7 +822,7 @@ qla2x00_resize_request_q(scsi_qla_host_t *ha) if (IS_QLA2100(ha) || IS_QLA2200(ha)) return; - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) qla2x00_alloc_fw_dump(ha); /* Retrieve IOCB counts available to the firmware. */ @@ -2123,7 +2123,7 @@ qla2x00_configure_fabric(scsi_qla_host_t *ha) LIST_HEAD(new_fcports); /* If FL port exists, then SNS is present */ - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) loop_id = NPH_F_PORT; else loop_id = SNS_FL_PORT; @@ -2149,7 +2149,7 @@ qla2x00_configure_fabric(scsi_qla_host_t *ha) qla2x00_fdmi_register(ha); /* Ensure we are logged into the SNS. 
*/ - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) loop_id = NPH_SNS; else loop_id = SIMPLE_NAME_SERVER; @@ -2640,7 +2640,7 @@ qla2x00_device_resync(scsi_qla_host_t *ha) if (ql2xprocessrscn && !IS_QLA2100(ha) && !IS_QLA2200(ha) && !IS_QLA6312(ha) && !IS_QLA6322(ha) && - !IS_QLA24XX(ha) && !IS_QLA25XX(ha) && + !IS_QLA24XX(ha) && !IS_QLA54XX(ha) && ha->flags.init_done) { /* Handle port RSCN via asyncronous IOCBs */ rval2 = qla2x00_handle_port_rscn(ha, rscn_entry, @@ -3130,7 +3130,7 @@ qla2x00_restart_isp(scsi_qla_host_t *ha) spin_lock_irqsave(&ha->hardware_lock, flags); - if (!IS_QLA24XX(ha) && !IS_QLA25XX(ha)) { + if (!IS_QLA24XX(ha) && !IS_QLA54XX(ha)) { /* * Disable SRAM, Instruction RAM and GP RAM * parity. @@ -3146,7 +3146,7 @@ qla2x00_restart_isp(scsi_qla_host_t *ha) spin_lock_irqsave(&ha->hardware_lock, flags); - if (!IS_QLA24XX(ha) && !IS_QLA25XX(ha)) { + if (!IS_QLA24XX(ha) && !IS_QLA54XX(ha)) { /* Enable proper parity */ if (IS_QLA2300(ha)) /* SRAM parity */ diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h index ecc3741a452e..45007ee58067 100644 --- a/drivers/scsi/qla2xxx/qla_inline.h +++ b/drivers/scsi/qla2xxx/qla_inline.h @@ -163,7 +163,7 @@ static inline int qla2x00_is_reserved_id(scsi_qla_host_t *, uint16_t); static inline int qla2x00_is_reserved_id(scsi_qla_host_t *ha, uint16_t loop_id) { - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) return (loop_id > NPH_LAST_HANDLE); return ((loop_id > ha->last_loop_id && loop_id < SNS_FIRST_LOOP_ID) || diff --git a/drivers/scsi/qla2xxx/qla_iocb.c b/drivers/scsi/qla2xxx/qla_iocb.c index 6544b6d0891d..8f0f4a298357 100644 --- a/drivers/scsi/qla2xxx/qla_iocb.c +++ b/drivers/scsi/qla2xxx/qla_iocb.c @@ -466,7 +466,7 @@ __qla2x00_marker(scsi_qla_host_t *ha, uint16_t loop_id, uint16_t lun, mrk->entry_type = MARKER_TYPE; mrk->modifier = type; if (type != MK_SYNC_ALL) { - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if 
(IS_QLA24XX(ha) || IS_QLA54XX(ha)) { mrk24 = (struct mrk_entry_24xx *) mrk; mrk24->nport_handle = cpu_to_le16(loop_id); mrk24->lun[1] = LSB(lun); @@ -519,7 +519,7 @@ qla2x00_req_pkt(scsi_qla_host_t *ha) for (timer = HZ; timer; timer--) { if ((req_cnt + 2) >= ha->req_q_cnt) { /* Calculate number of free request entries. */ - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) cnt = (uint16_t)RD_REG_DWORD( &reg->isp24.req_q_out); else @@ -593,7 +593,7 @@ qla2x00_isp_cmd(scsi_qla_host_t *ha) ha->request_ring_ptr++; /* Set chip new ring index. */ - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { WRT_REG_DWORD(&reg->isp24.req_q_in, ha->req_ring_index); RD_REG_DWORD_RELAXED(&reg->isp24.req_q_in); } else { diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index c15458c2bf32..2003dbb70579 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -343,7 +343,7 @@ qla2x00_async_event(scsi_qla_host_t *ha, uint16_t *mb) ha->isp_ops.fw_dump(ha, 1); - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { if (mb[1] == 0 && mb[2] == 0) { qla_printk(KERN_ERR, ha, "Unrecoverable Hardware Error: adapter " @@ -521,7 +521,7 @@ qla2x00_async_event(scsi_qla_host_t *ha, uint16_t *mb) */ if (ql2xprocessrscn && !IS_QLA2100(ha) && !IS_QLA2200(ha) && !IS_QLA6312(ha) && - !IS_QLA6322(ha) && !IS_QLA24XX(ha) && !IS_QLA25XX(ha) && + !IS_QLA6322(ha) && !IS_QLA24XX(ha) && !IS_QLA54XX(ha) && ha->flags.init_done && mb[1] != 0xffff && ((ha->operating_mode == P2P && mb[1] != 0) || (ha->operating_mode != P2P && mb[1] != @@ -638,7 +638,7 @@ qla2x00_async_event(scsi_qla_host_t *ha, uint16_t *mb) "scsi(%ld): [R|Z]IO update completion.\n", ha->host_no)); - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) qla24xx_process_response_queue(ha); else qla2x00_process_response_queue(ha); @@ -810,7 +810,7 @@ qla2x00_status_entry(scsi_qla_host_t *ha, void 
*pkt) sts = (sts_entry_t *) pkt; sts24 = (struct sts_entry_24xx *) pkt; - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { comp_status = le16_to_cpu(sts24->comp_status); scsi_status = le16_to_cpu(sts24->scsi_status) & SS_MASK; } else { @@ -860,7 +860,7 @@ qla2x00_status_entry(scsi_qla_host_t *ha, void *pkt) fcport = sp->fcport; sense_len = rsp_info_len = resid_len = 0; - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { sense_len = le32_to_cpu(sts24->sense_len); rsp_info_len = le32_to_cpu(sts24->rsp_data_len); resid_len = le32_to_cpu(sts24->rsp_residual_count); @@ -878,7 +878,7 @@ qla2x00_status_entry(scsi_qla_host_t *ha, void *pkt) /* Check for any FCP transport errors. */ if (scsi_status & SS_RESPONSE_INFO_LEN_VALID) { /* Sense data lies beyond any FCP RESPONSE data. */ - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) sense_data += rsp_info_len; if (rsp_info_len > 3 && rsp_info[3]) { DEBUG2(printk("scsi(%ld:%d:%d:%d) FCP I/O protocol " @@ -1117,7 +1117,7 @@ qla2x00_status_entry(scsi_qla_host_t *ha, void *pkt) case CS_TIMEOUT: cp->result = DID_BUS_BUSY << 16; - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { DEBUG2(printk(KERN_INFO "scsi(%ld:%d:%d:%d): TIMEOUT status detected " "0x%x-0x%x\n", ha->host_no, cp->device->channel, @@ -1197,7 +1197,7 @@ qla2x00_status_cont_entry(scsi_qla_host_t *ha, sts_cont_entry_t *pkt) } /* Move sense data. 
*/ - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) host_to_fcp_swap(pkt->data, sizeof(pkt->data)); memcpy(sp->request_sense_ptr, pkt->data, sense_sz); DEBUG5(qla2x00_dump_buffer(sp->request_sense_ptr, sense_sz)); diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c index 584cc2f6dd35..267435f17482 100644 --- a/drivers/scsi/qla2xxx/qla_mbx.c +++ b/drivers/scsi/qla2xxx/qla_mbx.c @@ -91,7 +91,7 @@ qla2x00_mailbox_command(scsi_qla_host_t *ha, mbx_cmd_t *mcp) spin_lock_irqsave(&ha->hardware_lock, flags); /* Load mailbox registers. */ - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) optr = (uint16_t __iomem *)&reg->isp24.mailbox0; else optr = (uint16_t __iomem *)MAILBOX_REG(ha, &reg->isp, 0); @@ -155,7 +155,7 @@ qla2x00_mailbox_command(scsi_qla_host_t *ha, mbx_cmd_t *mcp) set_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags); - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) WRT_REG_DWORD(&reg->isp24.hccr, HCCRX_SET_HOST_INT); else WRT_REG_WORD(&reg->isp.hccr, HCCR_SET_HOST_INT); @@ -179,7 +179,7 @@ qla2x00_mailbox_command(scsi_qla_host_t *ha, mbx_cmd_t *mcp) DEBUG3_11(printk("%s(%ld): cmd=%x POLLING MODE.\n", __func__, ha->host_no, command);) - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) WRT_REG_DWORD(&reg->isp24.hccr, HCCRX_SET_HOST_INT); else WRT_REG_WORD(&reg->isp.hccr, HCCR_SET_HOST_INT); @@ -237,7 +237,7 @@ qla2x00_mailbox_command(scsi_qla_host_t *ha, mbx_cmd_t *mcp) uint16_t mb0; uint32_t ictrl; - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { mb0 = RD_REG_WORD(&reg->isp24.mailbox0); ictrl = RD_REG_DWORD(&reg->isp24.ictrl); } else { @@ -334,7 +334,7 @@ qla2x00_load_ram(scsi_qla_host_t *ha, dma_addr_t req_dma, uint32_t risc_addr, DEBUG11(printk("%s(%ld): entered.\n", __func__, ha->host_no)); - if (MSW(risc_addr) || IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (MSW(risc_addr) || IS_QLA24XX(ha) || IS_QLA54XX(ha)) { 
mcp->mb[0] = MBC_LOAD_RISC_RAM_EXTENDED; mcp->mb[8] = MSW(risc_addr); mcp->out_mb = MBX_8|MBX_0; @@ -348,7 +348,7 @@ qla2x00_load_ram(scsi_qla_host_t *ha, dma_addr_t req_dma, uint32_t risc_addr, mcp->mb[6] = MSW(MSD(req_dma)); mcp->mb[7] = LSW(MSD(req_dma)); mcp->out_mb |= MBX_7|MBX_6|MBX_3|MBX_2|MBX_1; - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { mcp->mb[4] = MSW(risc_code_size); mcp->mb[5] = LSW(risc_code_size); mcp->out_mb |= MBX_5|MBX_4; @@ -399,7 +399,7 @@ qla2x00_execute_fw(scsi_qla_host_t *ha, uint32_t risc_addr) mcp->mb[0] = MBC_EXECUTE_FIRMWARE; mcp->out_mb = MBX_0; mcp->in_mb = MBX_0; - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { mcp->mb[1] = MSW(risc_addr); mcp->mb[2] = LSW(risc_addr); mcp->mb[3] = 0; @@ -422,7 +422,7 @@ qla2x00_execute_fw(scsi_qla_host_t *ha, uint32_t risc_addr) DEBUG2_3_11(printk("%s(%ld): failed=%x mb[0]=%x.\n", __func__, ha->host_no, rval, mcp->mb[0])); } else { - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { DEBUG11(printk("%s(%ld): done exchanges=%x.\n", __func__, ha->host_no, mcp->mb[1]);) } else { @@ -563,7 +563,7 @@ qla2x00_set_fw_options(scsi_qla_host_t *ha, uint16_t *fwopts) mcp->mb[3] = fwopts[3]; mcp->out_mb = MBX_3|MBX_2|MBX_1|MBX_0; mcp->in_mb = MBX_0; - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { mcp->in_mb |= MBX_1; } else { mcp->mb[10] = fwopts[10]; @@ -676,7 +676,7 @@ qla2x00_verify_checksum(scsi_qla_host_t *ha, uint32_t risc_addr) mcp->mb[0] = MBC_VERIFY_CHECKSUM; mcp->out_mb = MBX_0; mcp->in_mb = MBX_0; - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { mcp->mb[1] = MSW(risc_addr); mcp->mb[2] = LSW(risc_addr); mcp->out_mb |= MBX_2|MBX_1; @@ -693,7 +693,7 @@ qla2x00_verify_checksum(scsi_qla_host_t *ha, uint32_t risc_addr) if (rval != QLA_SUCCESS) { DEBUG2_3_11(printk("%s(%ld): failed=%x chk sum=%x.\n", __func__, - ha->host_no, rval, 
(IS_QLA24XX(ha) || IS_QLA25XX(ha) ? + ha->host_no, rval, (IS_QLA24XX(ha) || IS_QLA54XX(ha) ? (mcp->mb[2] << 16) | mcp->mb[1]: mcp->mb[1]));) } else { DEBUG11(printk("%s(%ld): done.\n", __func__, ha->host_no);) @@ -751,7 +751,7 @@ qla2x00_issue_iocb(scsi_qla_host_t *ha, void* buffer, dma_addr_t phys_addr, /* Mask reserved bits. */ sts_entry->entry_status &= - IS_QLA24XX(ha) || IS_QLA25XX(ha) ? RF_MASK_24XX :RF_MASK; + IS_QLA24XX(ha) || IS_QLA54XX(ha) ? RF_MASK_24XX :RF_MASK; } return rval; @@ -1091,7 +1091,7 @@ qla2x00_get_port_database(scsi_qla_host_t *ha, fc_port_t *fcport, uint8_t opt) memset(pd, 0, max(PORT_DATABASE_SIZE, PORT_DATABASE_24XX_SIZE)); mcp->mb[0] = MBC_GET_PORT_DATABASE; - if (opt != 0 && !IS_QLA24XX(ha) && !IS_QLA25XX(ha)) + if (opt != 0 && !IS_QLA24XX(ha) && !IS_QLA54XX(ha)) mcp->mb[0] = MBC_ENHANCED_GET_PORT_DATABASE; mcp->mb[2] = MSW(pd_dma); mcp->mb[3] = LSW(pd_dma); @@ -1099,7 +1099,7 @@ qla2x00_get_port_database(scsi_qla_host_t *ha, fc_port_t *fcport, uint8_t opt) mcp->mb[7] = LSW(MSD(pd_dma)); mcp->out_mb = MBX_7|MBX_6|MBX_3|MBX_2|MBX_0; mcp->in_mb = MBX_0; - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { mcp->mb[1] = fcport->loop_id; mcp->mb[10] = opt; mcp->out_mb |= MBX_10|MBX_1; @@ -1112,7 +1112,7 @@ qla2x00_get_port_database(scsi_qla_host_t *ha, fc_port_t *fcport, uint8_t opt) mcp->mb[1] = fcport->loop_id << 8 | opt; mcp->out_mb |= MBX_1; } - mcp->buf_size = (IS_QLA24XX(ha) || IS_QLA25XX(ha) ? + mcp->buf_size = (IS_QLA24XX(ha) || IS_QLA54XX(ha) ? PORT_DATABASE_24XX_SIZE : PORT_DATABASE_SIZE); mcp->flags = MBX_DMA_IN; mcp->tov = (ha->login_timeout * 2) + (ha->login_timeout / 2); @@ -1120,7 +1120,7 @@ qla2x00_get_port_database(scsi_qla_host_t *ha, fc_port_t *fcport, uint8_t opt) if (rval != QLA_SUCCESS) goto gpd_error_out; - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { pd24 = (struct port_database_24xx *) pd; /* Check for logged in state. 
*/ @@ -1337,7 +1337,7 @@ qla2x00_lip_reset(scsi_qla_host_t *ha) DEBUG11(printk("%s(%ld): entered.\n", __func__, ha->host_no);) - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { mcp->mb[0] = MBC_LIP_FULL_LOGIN; mcp->mb[1] = BIT_0; mcp->mb[2] = 0xff; @@ -1866,7 +1866,7 @@ qla2x00_get_id_list(scsi_qla_host_t *ha, void *id_list, dma_addr_t id_list_dma, mcp->mb[0] = MBC_GET_ID_LIST; mcp->out_mb = MBX_0; - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { mcp->mb[2] = MSW(id_list_dma); mcp->mb[3] = LSW(id_list_dma); mcp->mb[6] = MSW(MSD(id_list_dma)); @@ -2057,7 +2057,7 @@ qla2x00_get_link_status(scsi_qla_host_t *ha, uint16_t loop_id, mcp->mb[7] = LSW(MSD(stat_buf_dma)); mcp->out_mb = MBX_7|MBX_6|MBX_3|MBX_2|MBX_0; mcp->in_mb = MBX_0; - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { mcp->mb[1] = loop_id; mcp->mb[4] = 0; mcp->mb[10] = 0; @@ -2324,7 +2324,7 @@ qla2x00_system_error(scsi_qla_host_t *ha) mbx_cmd_t mc; mbx_cmd_t *mcp = &mc; - if (!IS_QLA24XX(ha) && !IS_QLA25XX(ha)) + if (!IS_QLA24XX(ha) && !IS_QLA54XX(ha)) return QLA_FUNCTION_FAILED; DEBUG11(printk("%s(%ld): entered.\n", __func__, ha->host_no)); @@ -2434,7 +2434,7 @@ qla2x00_stop_firmware(scsi_qla_host_t *ha) mbx_cmd_t mc; mbx_cmd_t *mcp = &mc; - if (!IS_QLA24XX(ha) && !IS_QLA25XX(ha)) + if (!IS_QLA24XX(ha) && !IS_QLA54XX(ha)) return QLA_FUNCTION_FAILED; DEBUG11(printk("%s(%ld): entered.\n", __func__, ha->host_no)); diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 757c4c43c453..131614751196 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1183,11 +1183,11 @@ qla2x00_set_isp_flags(scsi_qla_host_t *ha) case PCI_DEVICE_ID_QLOGIC_ISP2432: ha->device_type |= DT_ISP2432; break; - case PCI_DEVICE_ID_QLOGIC_ISP2512: - ha->device_type |= DT_ISP2512; + case PCI_DEVICE_ID_QLOGIC_ISP5422: + ha->device_type |= DT_ISP5422; break; - case 
PCI_DEVICE_ID_QLOGIC_ISP2522: - ha->device_type |= DT_ISP2522; + case PCI_DEVICE_ID_QLOGIC_ISP5432: + ha->device_type |= DT_ISP5432; break; } } @@ -1433,7 +1433,7 @@ int qla2x00_probe_one(struct pci_dev *pdev, struct qla_board_info *brd_info) ha->gid_list_info_size = 6; if (IS_QLA2322(ha) || IS_QLA6322(ha)) ha->optrom_size = OPTROM_SIZE_2322; - } else if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + } else if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { host->max_id = MAX_TARGETS_2200; ha->mbx_count = MAILBOX_REGISTER_COUNT; ha->request_q_length = REQUEST_ENTRY_CNT_24XX; @@ -1559,7 +1559,7 @@ int qla2x00_probe_one(struct pci_dev *pdev, struct qla_board_info *brd_info) spin_lock_irqsave(&ha->hardware_lock, flags); reg = ha->iobase; - if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) { + if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { WRT_REG_DWORD(&reg->isp24.hccr, HCCRX_CLR_HOST_INT); WRT_REG_DWORD(&reg->isp24.hccr, HCCRX_CLR_RISC_INT); } else { @@ -2631,7 +2631,7 @@ qla2x00_request_firmware(scsi_qla_host_t *ha) blob = &qla_fw_blobs[FW_ISP2322]; } else if (IS_QLA6312(ha) || IS_QLA6322(ha)) { blob = &qla_fw_blobs[FW_ISP63XX]; - } else if (IS_QLA24XX(ha)) { + } else if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) { blob = &qla_fw_blobs[FW_ISP24XX]; } @@ -2687,6 +2687,10 @@ static struct pci_device_id qla2xxx_pci_tbl[] = { PCI_ANY_ID, PCI_ANY_ID, }, { PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_ISP2432, PCI_ANY_ID, PCI_ANY_ID, }, + { PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_ISP5422, + PCI_ANY_ID, PCI_ANY_ID, }, + { PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_ISP5432, + PCI_ANY_ID, PCI_ANY_ID, }, { 0 }, }; MODULE_DEVICE_TABLE(pci, qla2xxx_pci_tbl); diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 82b83da25d77..1afac931351e 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -852,6 +852,8 @@ #define PCI_DEVICE_ID_QLOGIC_ISP2432 0x2432 #define PCI_DEVICE_ID_QLOGIC_ISP2512 0x2512 #define PCI_DEVICE_ID_QLOGIC_ISP2522 0x2522 +#define PCI_DEVICE_ID_QLOGIC_ISP5422 0x5422 +#define 
PCI_DEVICE_ID_QLOGIC_ISP5432 0x5432 #define PCI_VENDOR_ID_CYRIX 0x1078 #define PCI_DEVICE_ID_CYRIX_5510 0x0000 -- cgit v1.2.3-71-gd317 From e935d5da8e5d12fabe5b632736c50eae0427e8c8 Mon Sep 17 00:00:00 2001 From: "Moore, Eric" Date: Tue, 14 Mar 2006 09:18:18 -0700 Subject: [SCSI] drivers/base/bus.c - export reprobe Adding support for exposing hidden raid components for sg interface. The sdev->no_uld_attach flag will be set accordingly. The sas module supports adding/removing raid volumes using online storage management application interface. This patch was provided to me by Christoph Hellwig. Signed-off-by: Eric Moore Signed-off-by: Greg Kroah-Hartman Signed-off-by: James Bottomley --- drivers/base/bus.c | 22 ++++++++++++++++++++++ include/linux/device.h | 1 + 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/bus.c b/drivers/base/bus.c index c3141565d59d..48718b7f4fa0 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -536,6 +536,28 @@ void bus_rescan_devices(struct bus_type * bus) bus_for_each_dev(bus, NULL, NULL, bus_rescan_devices_helper); } +/** + * device_reprobe - remove driver for a device and probe for a new driver + * @dev: the device to reprobe + * + * This function detaches the attached driver (if any) for the given + * device and restarts the driver probing process. It is intended + * to use if probing criteria changed during a devices lifetime and + * driver attachment should change accordingly. 
+ */ +void device_reprobe(struct device *dev) +{ + if (dev->driver) { + if (dev->parent) /* Needed for USB */ + down(&dev->parent->sem); + device_release_driver(dev); + if (dev->parent) + up(&dev->parent->sem); + } + + bus_rescan_devices_helper(dev, NULL); +} +EXPORT_SYMBOL_GPL(device_reprobe); struct bus_type * get_bus(struct bus_type * bus) { diff --git a/include/linux/device.h b/include/linux/device.h index 58df18d9cd3e..e8ac5bcfbec7 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -378,6 +378,7 @@ extern void device_bind_driver(struct device * dev); extern void device_release_driver(struct device * dev); extern int device_attach(struct device * dev); extern void driver_attach(struct device_driver * drv); +extern void device_reprobe(struct device *dev); /* -- cgit v1.2.3-71-gd317 From 30afc84cf7325e88fb9746340eba3c161080ff49 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 18 Mar 2006 18:40:14 +0900 Subject: [SCSI] libata: implement minimal transport template for ->eh_timed_out SCSI midlayer has moved hostt->eh_timed_out to transport template. As libata doesn't need full-blown transport support yet, implement minimal transport for libata. No transport class whatsoever, just an empty transport template with the ->eh_timed_out hook. 
Signed-off-by: Tejun Heo Signed-off-by: James Bottomley --- drivers/scsi/ahci.c | 1 - drivers/scsi/ata_piix.c | 1 - drivers/scsi/libata-core.c | 3 ++- drivers/scsi/libata-scsi.c | 10 ++++++++++ drivers/scsi/libata.h | 2 ++ drivers/scsi/pdc_adma.c | 1 - drivers/scsi/sata_mv.c | 1 - drivers/scsi/sata_nv.c | 1 - drivers/scsi/sata_promise.c | 1 - drivers/scsi/sata_qstor.c | 1 - drivers/scsi/sata_sil.c | 1 - drivers/scsi/sata_sil24.c | 1 - drivers/scsi/sata_sis.c | 1 - drivers/scsi/sata_svw.c | 1 - drivers/scsi/sata_sx4.c | 1 - drivers/scsi/sata_uli.c | 1 - drivers/scsi/sata_via.c | 1 - drivers/scsi/sata_vsc.c | 1 - include/linux/libata.h | 1 - 19 files changed, 14 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c index e97ab3e6de4d..a1ddbba2cbdf 100644 --- a/drivers/scsi/ahci.c +++ b/drivers/scsi/ahci.c @@ -207,7 +207,6 @@ static struct scsi_host_template ahci_sht = { .name = DRV_NAME, .ioctl = ata_scsi_ioctl, .queuecommand = ata_scsi_queuecmd, - .eh_timed_out = ata_scsi_timed_out, .eh_strategy_handler = ata_scsi_error, .can_queue = ATA_DEF_QUEUE, .this_id = ATA_SHT_THIS_ID, diff --git a/drivers/scsi/ata_piix.c b/drivers/scsi/ata_piix.c index 9327b62f97de..a74e23d39ba9 100644 --- a/drivers/scsi/ata_piix.c +++ b/drivers/scsi/ata_piix.c @@ -209,7 +209,6 @@ static struct scsi_host_template piix_sht = { .name = DRV_NAME, .ioctl = ata_scsi_ioctl, .queuecommand = ata_scsi_queuecmd, - .eh_timed_out = ata_scsi_timed_out, .eh_strategy_handler = ata_scsi_error, .can_queue = ATA_DEF_QUEUE, .this_id = ATA_SHT_THIS_ID, diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c index 714b42bad935..64dce00e9c46 100644 --- a/drivers/scsi/libata-core.c +++ b/drivers/scsi/libata-core.c @@ -4653,6 +4653,8 @@ static struct ata_port * ata_host_add(const struct ata_probe_ent *ent, if (!host) return NULL; + host->transportt = &ata_scsi_transport_template; + ap = (struct ata_port *) &host->hostdata[0]; 
ata_host_init(ap, host, host_set, ent, port_no); @@ -5084,7 +5086,6 @@ EXPORT_SYMBOL_GPL(ata_busy_sleep); EXPORT_SYMBOL_GPL(ata_port_queue_task); EXPORT_SYMBOL_GPL(ata_scsi_ioctl); EXPORT_SYMBOL_GPL(ata_scsi_queuecmd); -EXPORT_SYMBOL_GPL(ata_scsi_timed_out); EXPORT_SYMBOL_GPL(ata_scsi_error); EXPORT_SYMBOL_GPL(ata_scsi_slave_config); EXPORT_SYMBOL_GPL(ata_scsi_release); diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c index ccedb4536977..bd9f2176f79a 100644 --- a/drivers/scsi/libata-scsi.c +++ b/drivers/scsi/libata-scsi.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,7 @@ typedef unsigned int (*ata_xlat_func_t)(struct ata_queued_cmd *qc, const u8 *scsicmd); static struct ata_device * ata_scsi_find_dev(struct ata_port *ap, const struct scsi_device *scsidev); +enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd); #define RW_RECOVERY_MPAGE 0x1 #define RW_RECOVERY_MPAGE_LEN 12 @@ -92,6 +94,14 @@ static const u8 def_control_mpage[CONTROL_MPAGE_LEN] = { 0, 30 /* extended self test time, see 05-359r1 */ }; +/* + * libata transport template. libata doesn't do real transport stuff. + * It just needs the eh_timed_out hook. 
+ */ +struct scsi_transport_template ata_scsi_transport_template = { + .eh_timed_out = ata_scsi_timed_out, +}; + static void ata_scsi_invalid_field(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *)) diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h index f4c48c91b63d..65f52beea884 100644 --- a/drivers/scsi/libata.h +++ b/drivers/scsi/libata.h @@ -57,6 +57,8 @@ extern int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg); /* libata-scsi.c */ +extern struct scsi_transport_template ata_scsi_transport_template; + extern void ata_scsi_scan_host(struct ata_port *ap); extern int ata_scsi_error(struct Scsi_Host *host); extern unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf, diff --git a/drivers/scsi/pdc_adma.c b/drivers/scsi/pdc_adma.c index 5f33cc932e70..b3dc5f85ae0b 100644 --- a/drivers/scsi/pdc_adma.c +++ b/drivers/scsi/pdc_adma.c @@ -143,7 +143,6 @@ static struct scsi_host_template adma_ata_sht = { .name = DRV_NAME, .ioctl = ata_scsi_ioctl, .queuecommand = ata_scsi_queuecmd, - .eh_timed_out = ata_scsi_timed_out, .eh_strategy_handler = ata_scsi_error, .can_queue = ATA_DEF_QUEUE, .this_id = ATA_SHT_THIS_ID, diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c index e561281967dd..874c5be0843c 100644 --- a/drivers/scsi/sata_mv.c +++ b/drivers/scsi/sata_mv.c @@ -378,7 +378,6 @@ static struct scsi_host_template mv_sht = { .name = DRV_NAME, .ioctl = ata_scsi_ioctl, .queuecommand = ata_scsi_queuecmd, - .eh_timed_out = ata_scsi_timed_out, .eh_strategy_handler = ata_scsi_error, .can_queue = MV_USE_Q_DEPTH, .this_id = ATA_SHT_THIS_ID, diff --git a/drivers/scsi/sata_nv.c b/drivers/scsi/sata_nv.c index caffadc2e0ae..e5b20c6afc18 100644 --- a/drivers/scsi/sata_nv.c +++ b/drivers/scsi/sata_nv.c @@ -229,7 +229,6 @@ static struct scsi_host_template nv_sht = { .name = DRV_NAME, .ioctl = ata_scsi_ioctl, .queuecommand = ata_scsi_queuecmd, - .eh_timed_out = ata_scsi_timed_out, .eh_strategy_handler = ata_scsi_error, .can_queue = 
ATA_DEF_QUEUE, .this_id = ATA_SHT_THIS_ID, diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c index 84cb3940ad88..cc928c68a479 100644 --- a/drivers/scsi/sata_promise.c +++ b/drivers/scsi/sata_promise.c @@ -111,7 +111,6 @@ static struct scsi_host_template pdc_ata_sht = { .name = DRV_NAME, .ioctl = ata_scsi_ioctl, .queuecommand = ata_scsi_queuecmd, - .eh_timed_out = ata_scsi_timed_out, .eh_strategy_handler = ata_scsi_error, .can_queue = ATA_DEF_QUEUE, .this_id = ATA_SHT_THIS_ID, diff --git a/drivers/scsi/sata_qstor.c b/drivers/scsi/sata_qstor.c index 9602f43a298e..9ffe1ef0d205 100644 --- a/drivers/scsi/sata_qstor.c +++ b/drivers/scsi/sata_qstor.c @@ -132,7 +132,6 @@ static struct scsi_host_template qs_ata_sht = { .name = DRV_NAME, .ioctl = ata_scsi_ioctl, .queuecommand = ata_scsi_queuecmd, - .eh_timed_out = ata_scsi_timed_out, .eh_strategy_handler = ata_scsi_error, .can_queue = ATA_DEF_QUEUE, .this_id = ATA_SHT_THIS_ID, diff --git a/drivers/scsi/sata_sil.c b/drivers/scsi/sata_sil.c index 4f2a67ed39d8..3e75d6733239 100644 --- a/drivers/scsi/sata_sil.c +++ b/drivers/scsi/sata_sil.c @@ -146,7 +146,6 @@ static struct scsi_host_template sil_sht = { .name = DRV_NAME, .ioctl = ata_scsi_ioctl, .queuecommand = ata_scsi_queuecmd, - .eh_timed_out = ata_scsi_timed_out, .eh_strategy_handler = ata_scsi_error, .can_queue = ATA_DEF_QUEUE, .this_id = ATA_SHT_THIS_ID, diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c index 9a53a5ed38c5..5d01e5ce5ac5 100644 --- a/drivers/scsi/sata_sil24.c +++ b/drivers/scsi/sata_sil24.c @@ -281,7 +281,6 @@ static struct scsi_host_template sil24_sht = { .name = DRV_NAME, .ioctl = ata_scsi_ioctl, .queuecommand = ata_scsi_queuecmd, - .eh_timed_out = ata_scsi_timed_out, .eh_strategy_handler = ata_scsi_error, .can_queue = ATA_DEF_QUEUE, .this_id = ATA_SHT_THIS_ID, diff --git a/drivers/scsi/sata_sis.c b/drivers/scsi/sata_sis.c index 7fd45f86de99..acc8439dea23 100644 --- a/drivers/scsi/sata_sis.c +++ 
b/drivers/scsi/sata_sis.c @@ -87,7 +87,6 @@ static struct scsi_host_template sis_sht = { .name = DRV_NAME, .ioctl = ata_scsi_ioctl, .queuecommand = ata_scsi_queuecmd, - .eh_timed_out = ata_scsi_timed_out, .eh_strategy_handler = ata_scsi_error, .can_queue = ATA_DEF_QUEUE, .this_id = ATA_SHT_THIS_ID, diff --git a/drivers/scsi/sata_svw.c b/drivers/scsi/sata_svw.c index 4aaccd53e736..051e47d975ca 100644 --- a/drivers/scsi/sata_svw.c +++ b/drivers/scsi/sata_svw.c @@ -288,7 +288,6 @@ static struct scsi_host_template k2_sata_sht = { .name = DRV_NAME, .ioctl = ata_scsi_ioctl, .queuecommand = ata_scsi_queuecmd, - .eh_timed_out = ata_scsi_timed_out, .eh_strategy_handler = ata_scsi_error, .can_queue = ATA_DEF_QUEUE, .this_id = ATA_SHT_THIS_ID, diff --git a/drivers/scsi/sata_sx4.c b/drivers/scsi/sata_sx4.c index 9f8a76815402..ae70f60c7c0d 100644 --- a/drivers/scsi/sata_sx4.c +++ b/drivers/scsi/sata_sx4.c @@ -182,7 +182,6 @@ static struct scsi_host_template pdc_sata_sht = { .name = DRV_NAME, .ioctl = ata_scsi_ioctl, .queuecommand = ata_scsi_queuecmd, - .eh_timed_out = ata_scsi_timed_out, .eh_strategy_handler = ata_scsi_error, .can_queue = ATA_DEF_QUEUE, .this_id = ATA_SHT_THIS_ID, diff --git a/drivers/scsi/sata_uli.c b/drivers/scsi/sata_uli.c index 37a487b7d655..8f5025733def 100644 --- a/drivers/scsi/sata_uli.c +++ b/drivers/scsi/sata_uli.c @@ -75,7 +75,6 @@ static struct scsi_host_template uli_sht = { .name = DRV_NAME, .ioctl = ata_scsi_ioctl, .queuecommand = ata_scsi_queuecmd, - .eh_timed_out = ata_scsi_timed_out, .eh_strategy_handler = ata_scsi_error, .can_queue = ATA_DEF_QUEUE, .this_id = ATA_SHT_THIS_ID, diff --git a/drivers/scsi/sata_via.c b/drivers/scsi/sata_via.c index ff65a0b0457f..791bf652ba63 100644 --- a/drivers/scsi/sata_via.c +++ b/drivers/scsi/sata_via.c @@ -94,7 +94,6 @@ static struct scsi_host_template svia_sht = { .name = DRV_NAME, .ioctl = ata_scsi_ioctl, .queuecommand = ata_scsi_queuecmd, - .eh_timed_out = ata_scsi_timed_out, .eh_strategy_handler = 
ata_scsi_error, .can_queue = ATA_DEF_QUEUE, .this_id = ATA_SHT_THIS_ID, diff --git a/drivers/scsi/sata_vsc.c b/drivers/scsi/sata_vsc.c index b574379a7a82..ee75b9b38ae8 100644 --- a/drivers/scsi/sata_vsc.c +++ b/drivers/scsi/sata_vsc.c @@ -251,7 +251,6 @@ static struct scsi_host_template vsc_sata_sht = { .name = DRV_NAME, .ioctl = ata_scsi_ioctl, .queuecommand = ata_scsi_queuecmd, - .eh_timed_out = ata_scsi_timed_out, .eh_strategy_handler = ata_scsi_error, .can_queue = ATA_DEF_QUEUE, .this_id = ATA_SHT_THIS_ID, diff --git a/include/linux/libata.h b/include/linux/libata.h index 239408ecfddf..204c37a55f06 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -508,7 +508,6 @@ extern void ata_host_set_remove(struct ata_host_set *host_set); extern int ata_scsi_detect(struct scsi_host_template *sht); extern int ata_scsi_ioctl(struct scsi_device *dev, int cmd, void __user *arg); extern int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *)); -extern enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd); extern int ata_scsi_error(struct Scsi_Host *host); extern void ata_eh_qc_complete(struct ata_queued_cmd *qc); extern void ata_eh_qc_retry(struct ata_queued_cmd *qc); -- cgit v1.2.3-71-gd317 From 4de151d8cd2553e7e89044ab5d72fcad4eb04afb Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 22 Mar 2006 00:13:35 +0100 Subject: It's UTF-8 Fix some comments to "UTF-8". 
Signed-off-by: Alexey Dobriyan Signed-off-by: Adrian Bunk --- Documentation/filesystems/isofs.txt | 4 ++-- Documentation/filesystems/jfs.txt | 2 +- Documentation/filesystems/vfat.txt | 6 +++--- fs/befs/linuxvfs.c | 2 +- fs/cifs/CHANGES | 2 +- fs/fat/dir.c | 2 +- fs/fat/inode.c | 2 +- fs/isofs/joliet.c | 2 +- fs/nls/Kconfig | 2 +- include/asm-mips/termbits.h | 2 +- include/linux/msdos_fs.h | 2 +- 11 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/isofs.txt b/Documentation/filesystems/isofs.txt index 424585ff6ea1..758e50401c16 100644 --- a/Documentation/filesystems/isofs.txt +++ b/Documentation/filesystems/isofs.txt @@ -9,9 +9,9 @@ when using discs encoded using Microsoft's Joliet extensions. iocharset=name Character set to use for converting from Unicode to ASCII. Joliet filenames are stored in Unicode format, but Unix for the most part doesn't know how to deal with Unicode. - There is also an option of doing UTF8 translations with the + There is also an option of doing UTF-8 translations with the utf8 option. - utf8 Encode Unicode names in UTF8 format. Default is no. + utf8 Encode Unicode names in UTF-8 format. Default is no. Mount options unique to the isofs filesystem. block=512 Set the block size for the disk to 512 bytes diff --git a/Documentation/filesystems/jfs.txt b/Documentation/filesystems/jfs.txt index 3e992daf99ad..bae128663748 100644 --- a/Documentation/filesystems/jfs.txt +++ b/Documentation/filesystems/jfs.txt @@ -6,7 +6,7 @@ The following mount options are supported: iocharset=name Character set to use for converting from Unicode to ASCII. The default is to do no conversion. Use - iocharset=utf8 for UTF8 translations. This requires + iocharset=utf8 for UTF-8 translations. This requires CONFIG_NLS_UTF8 to be set in the kernel .config file. iocharset=none specifies the default behavior explicitly. 
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index 5ead20c6c744..2001abbc60e6 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -28,16 +28,16 @@ iocharset=name -- Character set to use for converting between the know how to deal with Unicode. By default, FAT_DEFAULT_IOCHARSET setting is used. - There is also an option of doing UTF8 translations + There is also an option of doing UTF-8 translations with the utf8 option. NOTE: "iocharset=utf8" is not recommended. If unsure, you should consider the following option instead. -utf8= -- UTF8 is the filesystem safe version of Unicode that +utf8= -- UTF-8 is the filesystem safe version of Unicode that is used by the console. It can be be enabled for the filesystem with this option. If 'uni_xlate' gets set, - UTF8 gets disabled. + UTF-8 gets disabled. uni_xlate= -- Translate unhandled Unicode characters to special escaped sequences. This would let you backup and diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 2d365cb8eec6..dd6048ce0532 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -561,7 +561,7 @@ befs_utf2nls(struct super_block *sb, const char *in, * @sb: Superblock * @src: Input string buffer in NLS format * @srclen: Length of input string in bytes - * @dest: The output string in UTF8 format + * @dest: The output string in UTF-8 format * @destlen: Length of the output buffer * * Converts input string @src, which is in the format of the loaded NLS map, diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index d335015473a5..cb68efba35db 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -160,7 +160,7 @@ improperly zeroed buffer in CIFS Unix extensions set times call. Version 1.25 ------------ Fix internationalization problem in cifs readdir with filenames that map to -longer UTF8 strings than the string on the wire was in Unicode. Add workaround +longer UTF-8 strings than the string on the wire was in Unicode. 
Add workaround for readdir to netapp servers. Fix search rewind (seek into readdir to return non-consecutive entries). Do not do readdir when server negotiates buffer size to small to fit filename. Add support for reading POSIX ACLs from diff --git a/fs/fat/dir.c b/fs/fat/dir.c index db0de5c621c7..4095bc149eb1 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -114,7 +114,7 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos, } /* - * Convert Unicode 16 to UTF8, translated Unicode, or ASCII. + * Convert Unicode 16 to UTF-8, translated Unicode, or ASCII. * If uni_xlate is enabled and we can't get a 1:1 conversion, use a * colon as an escape character since it is normally invalid on the vfat * filesystem. The following four characters are the hexadecimal digits diff --git a/fs/fat/inode.c b/fs/fat/inode.c index e7f4aa7fc686..e78d7b4842cc 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1101,7 +1101,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, return -EINVAL; } } - /* UTF8 doesn't provide FAT semantics */ + /* UTF-8 doesn't provide FAT semantics */ if (!strcmp(opts->iocharset, "utf8")) { printk(KERN_ERR "FAT: utf8 is not a recommended IO charset" " for FAT filesystems, filesystem will be case sensitive!\n"); diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c index 2931de7f1a6a..81a90e170ac3 100644 --- a/fs/isofs/joliet.c +++ b/fs/isofs/joliet.c @@ -11,7 +11,7 @@ #include "isofs.h" /* - * Convert Unicode 16 to UTF8 or ASCII. + * Convert Unicode 16 to UTF-8 or ASCII. */ static int uni16_to_x8(unsigned char *ascii, u16 *uni, int len, struct nls_table *nls) diff --git a/fs/nls/Kconfig b/fs/nls/Kconfig index 0ab8f00bdbb2..976ecccd6f56 100644 --- a/fs/nls/Kconfig +++ b/fs/nls/Kconfig @@ -491,7 +491,7 @@ config NLS_KOI8_U (koi8-u) and Belarusian (koi8-ru) character sets. 
config NLS_UTF8 - tristate "NLS UTF8" + tristate "NLS UTF-8" depends on NLS help If you want to display filenames with native language characters diff --git a/include/asm-mips/termbits.h b/include/asm-mips/termbits.h index c29c65b7818e..fa6d04dac56b 100644 --- a/include/asm-mips/termbits.h +++ b/include/asm-mips/termbits.h @@ -77,7 +77,7 @@ struct termios { #define IXANY 0004000 /* Any character will restart after stop. */ #define IXOFF 0010000 /* Enable start/stop input control. */ #define IMAXBEL 0020000 /* Ring bell when input queue is full. */ -#define IUTF8 0040000 /* Input is UTF8 */ +#define IUTF8 0040000 /* Input is UTF-8 */ /* c_oflag bits */ #define OPOST 0000001 /* Perform output processing. */ diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h index e933e2a355ad..8bcd9450d926 100644 --- a/include/linux/msdos_fs.h +++ b/include/linux/msdos_fs.h @@ -199,7 +199,7 @@ struct fat_mount_options { sys_immutable:1, /* set = system files are immutable */ dotsOK:1, /* set = hidden and system files are named '.filename' */ isvfat:1, /* 0=no vfat long filename support, 1=vfat support */ - utf8:1, /* Use of UTF8 character set (Default) */ + utf8:1, /* Use of UTF-8 character set (Default) */ unicode_xlate:1, /* create escape sequences for unhandled Unicode */ numtail:1, /* Does first alias have a numeric '~1' type tail? 
*/ atari:1, /* Use Atari GEMDOS variation of MS-DOS fs */ -- cgit v1.2.3-71-gd317 From 116f232b3794a8b6ebde21aef5004b18cc1cfa86 Mon Sep 17 00:00:00 2001 From: Rytchkov Alexey Date: Wed, 22 Mar 2006 00:58:53 +0100 Subject: fixed path to moved file in include/linux/device.h Signed-off-by: Adrian Bunk --- include/linux/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 5b595fdfb672..10c1693a2529 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -399,7 +399,7 @@ extern struct device * get_device(struct device * dev); extern void put_device(struct device * dev); -/* drivers/base/power.c */ +/* drivers/base/power/shutdown.c */ extern void device_shutdown(void); -- cgit v1.2.3-71-gd317 From 89bbfc95d65839d6ae23ddab8a3cc5af4ae88383 Mon Sep 17 00:00:00 2001 From: Shaun Pereira Date: Tue, 21 Mar 2006 23:58:08 -0800 Subject: [NET]: allow 32 bit socket ioctl in 64 bit kernel Since the register_ioctl32_conversion() patch in the kernel is now obsolete, provide another method to allow 32 bit user space ioctls to reach the kernel. Signed-off-by: Shaun Pereira Acked-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: David S. 
Miller --- include/linux/net.h | 6 ++++++ net/socket.c | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 152fa6551fd8..84a490e5f0a1 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -143,6 +143,8 @@ struct proto_ops { struct poll_table_struct *wait); int (*ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg); + int (*compat_ioctl) (struct socket *sock, unsigned int cmd, + unsigned long arg); int (*listen) (struct socket *sock, int len); int (*shutdown) (struct socket *sock, int flags); int (*setsockopt)(struct socket *sock, int level, @@ -251,6 +253,8 @@ SOCKCALL_UWRAP(name, poll, (struct file *file, struct socket *sock, struct poll_ (file, sock, wait)) \ SOCKCALL_WRAP(name, ioctl, (struct socket *sock, unsigned int cmd, \ unsigned long arg), (sock, cmd, arg)) \ +SOCKCALL_WRAP(name, compat_ioctl, (struct socket *sock, unsigned int cmd, \ + unsigned long arg), (sock, cmd, arg)) \ SOCKCALL_WRAP(name, listen, (struct socket *sock, int len), (sock, len)) \ SOCKCALL_WRAP(name, shutdown, (struct socket *sock, int flags), (sock, flags)) \ SOCKCALL_WRAP(name, setsockopt, (struct socket *sock, int level, int optname, \ @@ -275,6 +279,7 @@ static const struct proto_ops name##_ops = { \ .getname = __lock_##name##_getname, \ .poll = __lock_##name##_poll, \ .ioctl = __lock_##name##_ioctl, \ + .compat_ioctl = __lock_##name##_compat_ioctl, \ .listen = __lock_##name##_listen, \ .shutdown = __lock_##name##_shutdown, \ .setsockopt = __lock_##name##_setsockopt, \ @@ -283,6 +288,7 @@ static const struct proto_ops name##_ops = { \ .recvmsg = __lock_##name##_recvmsg, \ .mmap = __lock_##name##_mmap, \ }; + #endif #define MODULE_ALIAS_NETPROTO(proto) \ diff --git a/net/socket.c b/net/socket.c index e3c21d5ec288..e2d5bae994de 100644 --- a/net/socket.c +++ b/net/socket.c @@ -107,6 +107,10 @@ static unsigned int sock_poll(struct file *file, struct poll_table_struct 
*wait); static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +#ifdef CONFIG_COMPAT +static long compat_sock_ioctl(struct file *file, + unsigned int cmd, unsigned long arg); +#endif static int sock_fasync(int fd, struct file *filp, int on); static ssize_t sock_readv(struct file *file, const struct iovec *vector, unsigned long count, loff_t *ppos); @@ -128,6 +132,9 @@ static struct file_operations socket_file_ops = { .aio_write = sock_aio_write, .poll = sock_poll, .unlocked_ioctl = sock_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_sock_ioctl, +#endif .mmap = sock_mmap, .open = sock_no_open, /* special open code to disallow open via /proc */ .release = sock_close, @@ -2136,6 +2143,20 @@ void socket_seq_show(struct seq_file *seq) } #endif /* CONFIG_PROC_FS */ +#ifdef CONFIG_COMPAT +static long compat_sock_ioctl(struct file *file, unsigned cmd, + unsigned long arg) +{ + struct socket *sock = file->private_data; + int ret = -ENOIOCTLCMD; + + if (sock->ops->compat_ioctl) + ret = sock->ops->compat_ioctl(sock, cmd, arg); + + return ret; +} +#endif + /* ABI emulation layers need these two */ EXPORT_SYMBOL(move_addr_to_kernel); EXPORT_SYMBOL(move_addr_to_user); -- cgit v1.2.3-71-gd317 From a64b7b936dcd926ace745c07c14f45ecfaddb034 Mon Sep 17 00:00:00 2001 From: Shaun Pereira Date: Wed, 22 Mar 2006 00:01:31 -0800 Subject: [X25]: allow ITU-T DTE facilities for x25 Allows use of the optional user facility to insert ITU-T (http://www.itu.int/ITU-T/) specified DTE facilities in call set-up x25 packets. This feature is optional; no facilities will be added if the ioctl is not used, and call setup packet remains the same as before. If the ioctls provided by the patch are used, then a facility marker will be added to the x25 packet header so that the called dte address extension facility can be differentiated from other types of facilities (as described in the ITU-T X.25 recommendation) that are also allowed in the x25 packet header. 
Facility markers are made up of two octets, and may be present in the x25 packet headers of call-request, incoming call, call accepted, clear request, and clear indication packets. The first of the two octets represents the facility code field and is set to zero by this patch. The second octet of the marker represents the facility parameter field and is set to 0x0F because the marker will be inserted before ITU-T type DTE facilities. According to ITU-T Recommendation X.25 (10/96), section 7.1, "All networks will support the facility markers with a facility parameter field set to all ones or to 00001111"; therefore this patch should work with all X.25 networks. While there are many ITU-T DTE facilities, this patch implements only the called and calling address extension, with placeholders in the x25_dte_facilities structure for the rest of the facilities. Testing: This patch was tested using a Cisco XOT router connected on its serial ports to an X.25 network, and on its LAN ports to a host running an xotd daemon. It is also possible to test this patch using an xotd daemon and an x25tap patch, where the xotd daemons work back-to-back without actually using an X.25 network. See www.fyonne.net for details on how to do this. Signed-off-by: Shaun Pereira Acked-by: Andrew Hendry Signed-off-by: Andrew Morton Signed-off-by: David S.
Miller --- include/linux/x25.h | 26 +++++++++++++++ include/net/x25.h | 21 ++++++++++--- net/x25/af_x25.c | 45 +++++++++++++++++++++++++- net/x25/x25_facilities.c | 82 +++++++++++++++++++++++++++++++++++++++++------- net/x25/x25_in.c | 3 +- net/x25/x25_subr.c | 6 ++-- 6 files changed, 163 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/x25.h b/include/linux/x25.h index 16d44931afa0..d035e4e87d07 100644 --- a/include/linux/x25.h +++ b/include/linux/x25.h @@ -11,6 +11,8 @@ #ifndef X25_KERNEL_H #define X25_KERNEL_H +#include + #define SIOCX25GSUBSCRIP (SIOCPROTOPRIVATE + 0) #define SIOCX25SSUBSCRIP (SIOCPROTOPRIVATE + 1) #define SIOCX25GFACILITIES (SIOCPROTOPRIVATE + 2) @@ -21,6 +23,8 @@ #define SIOCX25SCUDMATCHLEN (SIOCPROTOPRIVATE + 7) #define SIOCX25CALLACCPTAPPRV (SIOCPROTOPRIVATE + 8) #define SIOCX25SENDCALLACCPT (SIOCPROTOPRIVATE + 9) +#define SIOCX25GDTEFACILITIES (SIOCPROTOPRIVATE + 10) +#define SIOCX25SDTEFACILITIES (SIOCPROTOPRIVATE + 11) /* * Values for {get,set}sockopt. @@ -77,6 +81,8 @@ struct x25_subscrip_struct { #define X25_MASK_PACKET_SIZE 0x04 #define X25_MASK_WINDOW_SIZE 0x08 +#define X25_MASK_CALLING_AE 0x10 +#define X25_MASK_CALLED_AE 0x20 /* @@ -98,6 +104,26 @@ struct x25_facilities { unsigned int reverse; }; +/* +* ITU DTE facilities +* Only the called and calling address +* extension are currently implemented. +* The rest are in place to avoid the struct +* changing size if someone needs them later +*/ + +struct x25_dte_facilities { + __u16 delay_cumul; + __u16 delay_target; + __u16 delay_max; + __u8 min_throughput; + __u8 expedited; + __u8 calling_len; + __u8 called_len; + __u8 calling_ae[20]; + __u8 called_ae[20]; +}; + /* * Call User Data structure. 
*/ diff --git a/include/net/x25.h b/include/net/x25.h index fee62ff8c194..0ad90ebcf86e 100644 --- a/include/net/x25.h +++ b/include/net/x25.h @@ -101,9 +101,17 @@ enum { #define X25_FAC_PACKET_SIZE 0x42 #define X25_FAC_WINDOW_SIZE 0x43 -#define X25_MAX_FAC_LEN 20 /* Plenty to spare */ +#define X25_MAX_FAC_LEN 60 #define X25_MAX_CUD_LEN 128 +#define X25_FAC_CALLING_AE 0xCB +#define X25_FAC_CALLED_AE 0xC9 + +#define X25_MARKER 0x00 +#define X25_DTE_SERVICES 0x0F +#define X25_MAX_AE_LEN 40 /* Max num of semi-octets in AE - OSI Nw */ +#define X25_MAX_DTE_FACIL_LEN 21 /* Max length of DTE facility params */ + /** * struct x25_route - x25 routing entry * @node - entry in x25_list_lock @@ -148,6 +156,7 @@ struct x25_sock { struct timer_list timer; struct x25_causediag causediag; struct x25_facilities facilities; + struct x25_dte_facilities dte_facilities; struct x25_calluserdata calluserdata; unsigned long vc_facil_mask; /* inc_call facilities mask */ }; @@ -180,9 +189,13 @@ extern void x25_establish_link(struct x25_neigh *); extern void x25_terminate_link(struct x25_neigh *); /* x25_facilities.c */ -extern int x25_parse_facilities(struct sk_buff *, struct x25_facilities *, unsigned long *); -extern int x25_create_facilities(unsigned char *, struct x25_facilities *, unsigned long); -extern int x25_negotiate_facilities(struct sk_buff *, struct sock *, struct x25_facilities *); +extern int x25_parse_facilities(struct sk_buff *, struct x25_facilities *, + struct x25_dte_facilities *, unsigned long *); +extern int x25_create_facilities(unsigned char *, struct x25_facilities *, + struct x25_dte_facilities *, unsigned long); +extern int x25_negotiate_facilities(struct sk_buff *, struct sock *, + struct x25_facilities *, + struct x25_dte_facilities *); extern void x25_limit_facilities(struct x25_facilities *, struct x25_neigh *); /* x25_in.c */ diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 03725c051752..7bf93df7248b 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c 
@@ -525,6 +525,13 @@ static int x25_create(struct socket *sock, int protocol) x25->facilities.pacsize_out = X25_DEFAULT_PACKET_SIZE; x25->facilities.throughput = X25_DEFAULT_THROUGHPUT; x25->facilities.reverse = X25_DEFAULT_REVERSE; + x25->dte_facilities.calling_len = 0; + x25->dte_facilities.called_len = 0; + memset(x25->dte_facilities.called_ae, '\0', + sizeof(x25->dte_facilities.called_ae)); + memset(x25->dte_facilities.calling_ae, '\0', + sizeof(x25->dte_facilities.calling_ae)); + rc = 0; out: return rc; @@ -561,6 +568,7 @@ static struct sock *x25_make_new(struct sock *osk) x25->t2 = ox25->t2; x25->facilities = ox25->facilities; x25->qbitincl = ox25->qbitincl; + x25->dte_facilities = ox25->dte_facilities; x25->cudmatchlength = ox25->cudmatchlength; x25->accptapprv = ox25->accptapprv; @@ -840,6 +848,7 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, struct x25_sock *makex25; struct x25_address source_addr, dest_addr; struct x25_facilities facilities; + struct x25_dte_facilities dte_facilities; int len, rc; /* @@ -876,7 +885,8 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, /* * Try to reach a compromise on the requested facilities. 
*/ - if ((len = x25_negotiate_facilities(skb, sk, &facilities)) == -1) + len = x25_negotiate_facilities(skb, sk, &facilities, &dte_facilities); + if (len == -1) goto out_sock_put; /* @@ -907,9 +917,12 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, makex25->source_addr = source_addr; makex25->neighbour = nb; makex25->facilities = facilities; + makex25->dte_facilities= dte_facilities; makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask; /* ensure no reverse facil on accept */ makex25->vc_facil_mask &= ~X25_MASK_REVERSE; + /* ensure no calling address extension on accept */ + makex25->vc_facil_mask &= ~X25_MASK_CALLING_AE; makex25->cudmatchlength = x25_sk(sk)->cudmatchlength; /* Normally all calls are accepted immediatly */ @@ -1316,6 +1329,36 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; } + case SIOCX25GDTEFACILITIES: { + rc = copy_to_user(argp, &x25->dte_facilities, + sizeof(x25->dte_facilities)); + if (rc) + rc = -EFAULT; + break; + } + + case SIOCX25SDTEFACILITIES: { + struct x25_dte_facilities dtefacs; + rc = -EFAULT; + if (copy_from_user(&dtefacs, argp, sizeof(dtefacs))) + break; + rc = -EINVAL; + if (sk->sk_state != TCP_LISTEN && + sk->sk_state != TCP_CLOSE) + break; + if (dtefacs.calling_len > X25_MAX_AE_LEN) + break; + if (dtefacs.calling_ae == NULL) + break; + if (dtefacs.called_len > X25_MAX_AE_LEN) + break; + if (dtefacs.called_ae == NULL) + break; + x25->dte_facilities = dtefacs; + rc = 0; + break; + } + case SIOCX25GCALLUSERDATA: { struct x25_calluserdata cud = x25->calluserdata; rc = copy_to_user(argp, &cud, diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c index 54278b962f4c..9f42b9c9de37 100644 --- a/net/x25/x25_facilities.c +++ b/net/x25/x25_facilities.c @@ -28,18 +28,28 @@ #include /* - * Parse a set of facilities into the facilities structure. Unrecognised + * Parse a set of facilities into the facilities structures. 
Unrecognised * facilities are written to the debug log file. */ -int x25_parse_facilities(struct sk_buff *skb, - struct x25_facilities *facilities, - unsigned long *vc_fac_mask) +int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities, + struct x25_dte_facilities *dte_facs, unsigned long *vc_fac_mask) { unsigned char *p = skb->data; unsigned int len = *p++; *vc_fac_mask = 0; + /* + * The kernel knows which facilities were set on an incoming call but + * currently this information is not available to userspace. Here we + * give userspace who read incoming call facilities 0 length to indicate + * it wasn't set. + */ + dte_facs->calling_len = 0; + dte_facs->called_len = 0; + memset(dte_facs->called_ae, '\0', sizeof(dte_facs->called_ae)); + memset(dte_facs->calling_ae, '\0', sizeof(dte_facs->calling_ae)); + while (len > 0) { switch (*p & X25_FAC_CLASS_MASK) { case X25_FAC_CLASS_A: @@ -74,6 +84,8 @@ int x25_parse_facilities(struct sk_buff *skb, facilities->throughput = p[1]; *vc_fac_mask |= X25_MASK_THROUGHPUT; break; + case X25_MARKER: + break; default: printk(KERN_DEBUG "X.25: unknown facility " "%02X, value %02X\n", @@ -112,11 +124,30 @@ int x25_parse_facilities(struct sk_buff *skb, len -= 4; break; case X25_FAC_CLASS_D: - printk(KERN_DEBUG "X.25: unknown facility %02X, " - "length %d, values %02X, %02X, %02X, %02X\n", - p[0], p[1], p[2], p[3], p[4], p[5]); + switch (*p) { + case X25_FAC_CALLING_AE: + if (p[1] > X25_MAX_DTE_FACIL_LEN) + break; + dte_facs->calling_len = p[2]; + memcpy(dte_facs->calling_ae, &p[3], p[1] - 1); + *vc_fac_mask |= X25_MASK_CALLING_AE; + break; + case X25_FAC_CALLED_AE: + if (p[1] > X25_MAX_DTE_FACIL_LEN) + break; + dte_facs->called_len = p[2]; + memcpy(dte_facs->called_ae, &p[3], p[1] - 1); + *vc_fac_mask |= X25_MASK_CALLED_AE; + break; + default: + printk(KERN_DEBUG "X.25: unknown facility %02X," + "length %d, values %02X, %02X, " + "%02X, %02X\n", + p[0], p[1], p[2], p[3], p[4], p[5]); + break; + } len -= p[1] + 2; 
- p += p[1] + 2; + p += p[1] + 2; break; } } @@ -128,8 +159,8 @@ int x25_parse_facilities(struct sk_buff *skb, * Create a set of facilities. */ int x25_create_facilities(unsigned char *buffer, - struct x25_facilities *facilities, - unsigned long facil_mask) + struct x25_facilities *facilities, + struct x25_dte_facilities *dte_facs, unsigned long facil_mask) { unsigned char *p = buffer + 1; int len; @@ -168,6 +199,33 @@ int x25_create_facilities(unsigned char *buffer, *p++ = facilities->winsize_out ? : facilities->winsize_in; } + if (facil_mask & (X25_MASK_CALLING_AE|X25_MASK_CALLED_AE)) { + *p++ = X25_MARKER; + *p++ = X25_DTE_SERVICES; + } + + if (dte_facs->calling_len && (facil_mask & X25_MASK_CALLING_AE)) { + unsigned bytecount = (dte_facs->calling_len % 2) ? + dte_facs->calling_len / 2 + 1 : + dte_facs->calling_len / 2; + *p++ = X25_FAC_CALLING_AE; + *p++ = 1 + bytecount; + *p++ = dte_facs->calling_len; + memcpy(p, dte_facs->calling_ae, bytecount); + p += bytecount; + } + + if (dte_facs->called_len && (facil_mask & X25_MASK_CALLED_AE)) { + unsigned bytecount = (dte_facs->called_len % 2) ? + dte_facs->called_len / 2 + 1 : + dte_facs->called_len / 2; + *p++ = X25_FAC_CALLED_AE; + *p++ = 1 + bytecount; + *p++ = dte_facs->called_len; + memcpy(p, dte_facs->called_ae, bytecount); + p+=bytecount; + } + len = p - buffer; buffer[0] = len - 1; @@ -180,7 +238,7 @@ int x25_create_facilities(unsigned char *buffer, * The only real problem is with reverse charging. 
*/ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk, - struct x25_facilities *new) + struct x25_facilities *new, struct x25_dte_facilities *dte) { struct x25_sock *x25 = x25_sk(sk); struct x25_facilities *ours = &x25->facilities; @@ -190,7 +248,7 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk, memset(&theirs, 0, sizeof(theirs)); memcpy(new, ours, sizeof(*new)); - len = x25_parse_facilities(skb, &theirs, &x25->vc_facil_mask); + len = x25_parse_facilities(skb, &theirs, dte, &x25->vc_facil_mask); /* * They want reverse charging, we won't accept it. diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index 26146874b839..eed50e10f09b 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -106,7 +106,8 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp skb_pull(skb, x25_addr_ntoa(skb->data, &source_addr, &dest_addr)); skb_pull(skb, x25_parse_facilities(skb, &x25->facilities, - &x25->vc_facil_mask)); + &x25->dte_facilities, + &x25->vc_facil_mask)); /* * Copy any Call User Data. 
*/ diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index 8be9b8fbc24d..8d6220aa5d0f 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -190,8 +190,9 @@ void x25_write_internal(struct sock *sk, int frametype) dptr = skb_put(skb, len); memcpy(dptr, addresses, len); len = x25_create_facilities(facilities, - &x25->facilities, - x25->neighbour->global_facil_mask); + &x25->facilities, + &x25->dte_facilities, + x25->neighbour->global_facil_mask); dptr = skb_put(skb, len); memcpy(dptr, facilities, len); dptr = skb_put(skb, x25->calluserdata.cudlength); @@ -206,6 +207,7 @@ void x25_write_internal(struct sock *sk, int frametype) *dptr++ = 0x00; /* Address lengths */ len = x25_create_facilities(facilities, &x25->facilities, + &x25->dte_facilities, x25->vc_facil_mask); dptr = skb_put(skb, len); memcpy(dptr, facilities, len); -- cgit v1.2.3-71-gd317 From 9d2f928ddf64ca0361562e30faf584cd33055c60 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 22 Mar 2006 10:53:19 +0100 Subject: [PATCH] Introduce DMA_28BIT_MASK This patch introduces the DMA_28BIT_MASK constant in dma-mapping.h. ALSA drivers using this mask are changed to use the new constant.
Signed-off-by: Tobias Klauser Acked-by: Takashi Iwai Acked-by: Jaroslav Kysela --- include/linux/dma-mapping.h | 1 + sound/pci/ad1889.c | 7 ++++--- sound/pci/emu10k1/emu10k1x.c | 13 +++++++------ sound/pci/es1968.c | 5 +++-- sound/pci/ice1712/ice1712.c | 5 +++-- sound/pci/maestro3.c | 5 +++-- sound/pci/mixart/mixart.c | 3 ++- sound/pci/pcxhr/pcxhr.c | 3 ++- 8 files changed, 25 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2d80cc761a15..a8731062a74c 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -20,6 +20,7 @@ enum dma_data_direction { #define DMA_31BIT_MASK 0x000000007fffffffULL #define DMA_30BIT_MASK 0x000000003fffffffULL #define DMA_29BIT_MASK 0x000000001fffffffULL +#define DMA_28BIT_MASK 0x000000000fffffffULL #include diff --git a/sound/pci/ad1889.c b/sound/pci/ad1889.c index a208075cdc1e..2aa5a7fdb6e0 100644 --- a/sound/pci/ad1889.c +++ b/sound/pci/ad1889.c @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -909,10 +910,10 @@ snd_ad1889_create(struct snd_card *card, if ((err = pci_enable_device(pci)) < 0) return err; - + /* check PCI availability (32bit DMA) */ - if (pci_set_dma_mask(pci, 0xffffffff) < 0 || - pci_set_consistent_dma_mask(pci, 0xffffffff) < 0) { + if (pci_set_dma_mask(pci, DMA_32BIT_MASK) < 0 || + pci_set_consistent_dma_mask(pci, DMA_32BIT_MASK) < 0) { printk(KERN_ERR PFX "error setting 32-bit DMA mask.\n"); pci_disable_device(pci); return -ENXIO; diff --git a/sound/pci/emu10k1/emu10k1x.c b/sound/pci/emu10k1/emu10k1x.c index 1107c8ec7f78..2208dbd48be9 100644 --- a/sound/pci/emu10k1/emu10k1x.c +++ b/sound/pci/emu10k1/emu10k1x.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -893,24 +894,24 @@ static int __devinit snd_emu10k1x_create(struct snd_card *card, static struct snd_device_ops ops = { .dev_free = snd_emu10k1x_dev_free, }; - + *rchip = NULL; - + if ((err = 
pci_enable_device(pci)) < 0) return err; - if (pci_set_dma_mask(pci, 0x0fffffff) < 0 || - pci_set_consistent_dma_mask(pci, 0x0fffffff) < 0) { + if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 || + pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) { snd_printk(KERN_ERR "error to set 28bit mask DMA\n"); pci_disable_device(pci); return -ENXIO; } - + chip = kzalloc(sizeof(*chip), GFP_KERNEL); if (chip == NULL) { pci_disable_device(pci); return -ENOMEM; } - + chip->card = card; chip->pci = pci; chip->irq = -1; diff --git a/sound/pci/es1968.c b/sound/pci/es1968.c index 6a265ab3894e..dd465a186e11 100644 --- a/sound/pci/es1968.c +++ b/sound/pci/es1968.c @@ -100,6 +100,7 @@ #include #include #include +#include #include #include #include @@ -2561,8 +2562,8 @@ static int __devinit snd_es1968_create(struct snd_card *card, if ((err = pci_enable_device(pci)) < 0) return err; /* check, if we can restrict PCI DMA transfers to 28 bits */ - if (pci_set_dma_mask(pci, 0x0fffffff) < 0 || - pci_set_consistent_dma_mask(pci, 0x0fffffff) < 0) { + if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 || + pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) { snd_printk(KERN_ERR "architecture does not support 28bit PCI busmaster DMA\n"); pci_disable_device(pci); return -ENXIO; diff --git a/sound/pci/ice1712/ice1712.c b/sound/pci/ice1712/ice1712.c index b96b5d6efc5d..672e198317e1 100644 --- a/sound/pci/ice1712/ice1712.c +++ b/sound/pci/ice1712/ice1712.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -2553,8 +2554,8 @@ static int __devinit snd_ice1712_create(struct snd_card *card, if ((err = pci_enable_device(pci)) < 0) return err; /* check, if we can restrict PCI DMA transfers to 28 bits */ - if (pci_set_dma_mask(pci, 0x0fffffff) < 0 || - pci_set_consistent_dma_mask(pci, 0x0fffffff) < 0) { + if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 || + pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) { snd_printk(KERN_ERR "architecture does not support 28bit PCI 
busmaster DMA\n"); pci_disable_device(pci); return -ENXIO; diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c index d3ef0cc6c4f9..8bc084956c28 100644 --- a/sound/pci/maestro3.c +++ b/sound/pci/maestro3.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -2657,8 +2658,8 @@ snd_m3_create(struct snd_card *card, struct pci_dev *pci, return -EIO; /* check, if we can restrict PCI DMA transfers to 28 bits */ - if (pci_set_dma_mask(pci, 0x0fffffff) < 0 || - pci_set_consistent_dma_mask(pci, 0x0fffffff) < 0) { + if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 || + pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) { snd_printk(KERN_ERR "architecture does not support 28bit PCI busmaster DMA\n"); pci_disable_device(pci); return -ENXIO; diff --git a/sound/pci/mixart/mixart.c b/sound/pci/mixart/mixart.c index e79fb264532b..43ee3b2b948f 100644 --- a/sound/pci/mixart/mixart.c +++ b/sound/pci/mixart/mixart.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -1289,7 +1290,7 @@ static int __devinit snd_mixart_probe(struct pci_dev *pci, pci_set_master(pci); /* check if we can restrict PCI DMA transfers to 32 bits */ - if (pci_set_dma_mask(pci, 0xffffffff) < 0) { + if (pci_set_dma_mask(pci, DMA_32BIT_MASK) < 0) { snd_printk(KERN_ERR "architecture does not support 32bit PCI busmaster DMA\n"); pci_disable_device(pci); return -ENXIO; diff --git a/sound/pci/pcxhr/pcxhr.c b/sound/pci/pcxhr/pcxhr.c index 31a3e8e1b234..f679779d96e3 100644 --- a/sound/pci/pcxhr/pcxhr.c +++ b/sound/pci/pcxhr/pcxhr.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1217,7 +1218,7 @@ static int __devinit pcxhr_probe(struct pci_dev *pci, const struct pci_device_id pci_set_master(pci); /* check if we can restrict PCI DMA transfers to 32 bits */ - if (pci_set_dma_mask(pci, 0xffffffff) < 0) { + if (pci_set_dma_mask(pci, DMA_32BIT_MASK) < 0) { snd_printk(KERN_ERR "architecture does not support 32bit PCI 
busmaster DMA\n"); pci_disable_device(pci); return -ENXIO; -- cgit v1.2.3-71-gd317 From 4024ce5e0f396447cc1e07fd65c2a1d056b066bb Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Wed, 22 Mar 2006 00:07:43 -0800 Subject: [PATCH] rtc.h broke strace(1) builds Git patch 52dfa9a64cfb3dd01fa1ee1150d589481e54e28e [PATCH] move rtc_interrupt() prototype to rtc.h broke strace(1) builds. The below moves the kernel-only additions lower, under the already provided #ifdef __KERNEL__ statement. Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rtc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 0b2ba67ff13c..b739ac1f7ca0 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -11,8 +11,6 @@ #ifndef _LINUX_RTC_H_ #define _LINUX_RTC_H_ -#include - /* * The struct used to pass data via the following ioctl. Similar to the * struct tm in , but it needs to be here so that the kernel @@ -95,6 +93,8 @@ struct rtc_pll_info { #ifdef __KERNEL__ +#include + typedef struct rtc_task { void (*func)(void *private_data); void *private_data; -- cgit v1.2.3-71-gd317 From 8d438f96d2b8eade6cbcd8adfc22dae6f5cbd6c0 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:07:59 -0800 Subject: [PATCH] mm: PageLRU no testset PG_lru is protected by zone->lru_lock. It does not need TestSet/TestClear operations. 
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 5 ++--- mm/swap.c | 16 ++++++++-------- mm/vmscan.c | 20 +++++++++++--------- 3 files changed, 21 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d52999c43336..58856c823f8b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -239,10 +239,9 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); #define __ClearPageDirty(page) __clear_bit(PG_dirty, &(page)->flags) #define TestClearPageDirty(page) test_and_clear_bit(PG_dirty, &(page)->flags) -#define SetPageLRU(page) set_bit(PG_lru, &(page)->flags) #define PageLRU(page) test_bit(PG_lru, &(page)->flags) -#define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) -#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) +#define SetPageLRU(page) set_bit(PG_lru, &(page)->flags) +#define ClearPageLRU(page) clear_bit(PG_lru, &(page)->flags) #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) diff --git a/mm/swap.c b/mm/swap.c index 3045a0f4c451..985324ee9368 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -214,8 +214,8 @@ void fastcall __page_cache_release(struct page *page) struct zone *zone = page_zone(page); spin_lock_irqsave(&zone->lru_lock, flags); - if (!TestClearPageLRU(page)) - BUG(); + BUG_ON(!PageLRU(page)); + ClearPageLRU(page); del_page_from_lru(zone, page); spin_unlock_irqrestore(&zone->lru_lock, flags); } @@ -265,8 +265,8 @@ void release_pages(struct page **pages, int nr, int cold) zone = pagezone; spin_lock_irq(&zone->lru_lock); } - if (!TestClearPageLRU(page)) - BUG(); + BUG_ON(!PageLRU(page)); + ClearPageLRU(page); del_page_from_lru(zone, page); } @@ -345,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec) zone = pagezone; spin_lock_irq(&zone->lru_lock); } 
- if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); add_page_to_inactive_list(zone, page); } if (zone) @@ -372,8 +372,8 @@ void __pagevec_lru_add_active(struct pagevec *pvec) zone = pagezone; spin_lock_irq(&zone->lru_lock); } - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); if (TestSetPageActive(page)) BUG(); add_page_to_active_list(zone, page); diff --git a/mm/vmscan.c b/mm/vmscan.c index acb7611cd525..40fb37828e8c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1042,9 +1042,10 @@ int isolate_lru_page(struct page *page) if (PageLRU(page)) { struct zone *zone = page_zone(page); spin_lock_irq(&zone->lru_lock); - if (TestClearPageLRU(page)) { + if (PageLRU(page)) { ret = 1; get_page(page); + ClearPageLRU(page); if (PageActive(page)) del_page_from_active_list(zone, page); else @@ -1085,6 +1086,8 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); + BUG_ON(!PageLRU(page)); + list_del(&page->lru); if (unlikely(get_page_testone(page))) { /* @@ -1100,8 +1103,7 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, * the page is not being freed elsewhere -- the page release * code relies on it. 
*/ - if (!TestClearPageLRU(page)) - BUG(); + ClearPageLRU(page); list_add(&page->lru, dst); nr_taken++; } @@ -1156,8 +1158,8 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) */ while (!list_empty(&page_list)) { page = lru_to_page(&page_list); - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); list_del(&page->lru); if (PageActive(page)) add_page_to_active_list(zone, page); @@ -1276,8 +1278,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); if (!TestClearPageActive(page)) BUG(); list_move(&page->lru, &zone->inactive_list); @@ -1305,8 +1307,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) while (!list_empty(&l_active)) { page = lru_to_page(&l_active); prefetchw_prev_lru_page(page, &l_active, flags); - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); BUG_ON(!PageActive(page)); list_move(&page->lru, &zone->active_list); pgmoved++; -- cgit v1.2.3-71-gd317 From 4c84cacfa424264f7ad5287298d3ea4a3e935278 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:00 -0800 Subject: [PATCH] mm: PageActive no testset PG_active is protected by zone->lru_lock, it does not need TestSet/TestClear operations. 
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 -- mm/swap.c | 4 ++-- mm/vmscan.c | 5 +++-- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 58856c823f8b..5d1e7bd85107 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -246,8 +246,6 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) -#define TestClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) -#define TestSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags) #define PageSlab(page) test_bit(PG_slab, &(page)->flags) #define SetPageSlab(page) set_bit(PG_slab, &(page)->flags) diff --git a/mm/swap.c b/mm/swap.c index 985324ee9368..cf88226cf96d 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -374,8 +374,8 @@ void __pagevec_lru_add_active(struct pagevec *pvec) } BUG_ON(PageLRU(page)); SetPageLRU(page); - if (TestSetPageActive(page)) - BUG(); + BUG_ON(PageActive(page)); + SetPageActive(page); add_page_to_active_list(zone, page); } if (zone) diff --git a/mm/vmscan.c b/mm/vmscan.c index 40fb37828e8c..8e477b1a4838 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1280,8 +1280,9 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) prefetchw_prev_lru_page(page, &l_inactive, flags); BUG_ON(PageLRU(page)); SetPageLRU(page); - if (!TestClearPageActive(page)) - BUG(); + BUG_ON(!PageActive(page)); + ClearPageActive(page); + list_move(&page->lru, &zone->inactive_list); pgmoved++; if (!pagevec_add(&pvec, page)) { -- cgit v1.2.3-71-gd317 From 674539115cc88473f623581e1d53c0e2ecef2179 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:00 -0800 Subject: [PATCH] mm: 
less atomic ops In the page release paths, we can be sure that nobody will mess with our page->flags because the refcount has dropped to 0. So no need for atomic operations here. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 2 +- include/linux/page-flags.h | 2 ++ mm/swap.c | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8ac854f7f190..3b6723dfaff3 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -32,7 +32,7 @@ del_page_from_lru(struct zone *zone, struct page *page) { list_del(&page->lru); if (PageActive(page)) { - ClearPageActive(page); + __ClearPageActive(page); zone->nr_active--; } else { zone->nr_inactive--; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5d1e7bd85107..da71d63df465 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -242,10 +242,12 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); #define PageLRU(page) test_bit(PG_lru, &(page)->flags) #define SetPageLRU(page) set_bit(PG_lru, &(page)->flags) #define ClearPageLRU(page) clear_bit(PG_lru, &(page)->flags) +#define __ClearPageLRU(page) __clear_bit(PG_lru, &(page)->flags) #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) +#define __ClearPageActive(page) __clear_bit(PG_active, &(page)->flags) #define PageSlab(page) test_bit(PG_slab, &(page)->flags) #define SetPageSlab(page) set_bit(PG_slab, &(page)->flags) diff --git a/mm/swap.c b/mm/swap.c index cf88226cf96d..91b7e2026f69 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -215,7 +215,7 @@ void fastcall __page_cache_release(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); BUG_ON(!PageLRU(page)); - ClearPageLRU(page); + 
__ClearPageLRU(page); del_page_from_lru(zone, page); spin_unlock_irqrestore(&zone->lru_lock, flags); } @@ -266,7 +266,7 @@ void release_pages(struct page **pages, int nr, int cold) spin_lock_irq(&zone->lru_lock); } BUG_ON(!PageLRU(page)); - ClearPageLRU(page); + __ClearPageLRU(page); del_page_from_lru(zone, page); } -- cgit v1.2.3-71-gd317 From 5e9dace8d386def04219134d7160e8a778824764 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:01 -0800 Subject: [PATCH] mm: page_alloc less atomics More atomic operation removal from page allocator Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 4 ++-- mm/page_alloc.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index da71d63df465..76c7ffdd0424 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -328,8 +328,8 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); #define TestClearPageReclaim(page) test_and_clear_bit(PG_reclaim, &(page)->flags) #define PageCompound(page) test_bit(PG_compound, &(page)->flags) -#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) -#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) +#define __SetPageCompound(page) __set_bit(PG_compound, &(page)->flags) +#define __ClearPageCompound(page) __clear_bit(PG_compound, &(page)->flags) #ifdef CONFIG_SWAP #define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 61775866ea18..102919851353 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -190,7 +190,7 @@ static void prep_compound_page(struct page *page, unsigned long order) for (i = 0; i < nr_pages; i++) { struct page *p = page + i; - SetPageCompound(p); + __SetPageCompound(p); set_page_private(p, (unsigned long)page); } } @@ -209,7 +209,7 @@ static void 
destroy_compound_page(struct page *page, unsigned long order) if (unlikely(!PageCompound(p) | (page_private(p) != (unsigned long)page))) bad_page(page); - ClearPageCompound(p); + __ClearPageCompound(p); } } -- cgit v1.2.3-71-gd317 From f205b2fe62d321403525065a4cb31b6bff1bbe53 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:02 -0800 Subject: [PATCH] mm: slab less atomics Atomic operation removal from slab Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 6 ++---- mm/slab.c | 6 +++--- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 76c7ffdd0424..8cef69d462f2 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -250,10 +250,8 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); #define __ClearPageActive(page) __clear_bit(PG_active, &(page)->flags) #define PageSlab(page) test_bit(PG_slab, &(page)->flags) -#define SetPageSlab(page) set_bit(PG_slab, &(page)->flags) -#define ClearPageSlab(page) clear_bit(PG_slab, &(page)->flags) -#define TestClearPageSlab(page) test_and_clear_bit(PG_slab, &(page)->flags) -#define TestSetPageSlab(page) test_and_set_bit(PG_slab, &(page)->flags) +#define __SetPageSlab(page) __set_bit(PG_slab, &(page)->flags) +#define __ClearPageSlab(page) __clear_bit(PG_slab, &(page)->flags) #ifdef CONFIG_HIGHMEM #define PageHighMem(page) is_highmem(page_zone(page)) diff --git a/mm/slab.c b/mm/slab.c index d0bd7f07ab04..5988adf010c5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1402,7 +1402,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) atomic_add(i, &slab_reclaim_pages); add_page_state(nr_slab, i); while (i--) { - SetPageSlab(page); + __SetPageSlab(page); page++; } return addr; @@ -1418,8 +1418,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) const 
unsigned long nr_freed = i; while (i--) { - if (!TestClearPageSlab(page)) - BUG(); + BUG_ON(!PageSlab(page)); + __ClearPageSlab(page); page++; } sub_page_state(nr_slab, nr_freed); -- cgit v1.2.3-71-gd317 From 7c8ee9a86340db686cd4314e9944dc9b6111bda9 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:03 -0800 Subject: [PATCH] mm: simplify vmscan vs release refcounting The VM has an interesting race where a page refcount can drop to zero, but it is still on the LRU lists for a short time. This was solved by testing a 0->1 refcount transition when picking up pages from the LRU, and dropping the refcount in that case. Instead, use atomic_add_unless to ensure we never pick up a 0 refcount page from the LRU, thus a 0 refcount page will never have its refcount elevated until it is allocated again. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 19 +++++++++++-------- mm/vmscan.c | 25 +++++++++++-------------- 2 files changed, 22 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 498ff8778fb6..b12d5c76420d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -301,17 +301,20 @@ struct page { * Drop a ref, return true if the logical refcount fell to zero (the page has * no users) */ -#define put_page_testzero(p) \ - ({ \ - BUG_ON(atomic_read(&(p)->_count) == -1);\ - atomic_add_negative(-1, &(p)->_count); \ - }) +static inline int put_page_testzero(struct page *page) +{ + BUG_ON(atomic_read(&page->_count) == -1); + return atomic_add_negative(-1, &page->_count); +} /* - * Grab a ref, return true if the page previously had a logical refcount of - * zero. ie: returns true if we just grabbed an already-deemed-to-be-free page + * Try to grab a ref unless the page has a refcount of zero, return false if + * that is the case. 
*/ -#define get_page_testone(p) atomic_inc_and_test(&(p)->_count) +static inline int get_page_unless_zero(struct page *page) +{ + return atomic_add_unless(&page->_count, 1, -1); +} #define set_page_count(p,v) atomic_set(&(p)->_count, (v) - 1) #define __put_page(p) atomic_dec(&(p)->_count) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8e477b1a4838..e21bab4deda6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1083,29 +1083,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, int scan = 0; while (scan++ < nr_to_scan && !list_empty(src)) { + struct list_head *target; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); BUG_ON(!PageLRU(page)); list_del(&page->lru); - if (unlikely(get_page_testone(page))) { + target = src; + if (likely(get_page_unless_zero(page))) { /* - * It is being freed elsewhere + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. */ - __put_page(page); - list_add(&page->lru, src); - continue; - } + ClearPageLRU(page); + target = dst; + nr_taken++; + } /* else it is being freed elsewhere */ - /* - * Be careful not to clear PageLRU until after we're sure - * the page is not being freed elsewhere -- the page release - * code relies on it. - */ - ClearPageLRU(page); - list_add(&page->lru, dst); - nr_taken++; + list_add(&page->lru, target); } *scanned = scan; -- cgit v1.2.3-71-gd317 From 8dc04efbfb3c08a08fb7a3b97348d5d561b26ae2 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:03 -0800 Subject: [PATCH] mm: de-skew page refcounting atomic_add_unless (atomic_inc_not_zero) no longer requires an offset refcount to function correctly. 
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b12d5c76420d..9bbddf228cd9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -286,15 +286,6 @@ struct page { * * Also, many kernel routines increase the page count before a critical * routine so they can be sure the page doesn't go away from under them. - * - * Since 2.6.6 (approx), a free page has ->_count = -1. This is so that we - * can use atomic_add_negative(-1, page->_count) to detect when the page - * becomes free and so that we can also use atomic_inc_and_test to atomically - * detect when we just tried to grab a ref on a page which some other CPU has - * already deemed to be freeable. - * - * NO code should make assumptions about this internal detail! Use the provided - * macros which retain the old rules: page_count(page) == 0 is a free page. 
*/ /* @@ -303,8 +294,8 @@ struct page { */ static inline int put_page_testzero(struct page *page) { - BUG_ON(atomic_read(&page->_count) == -1); - return atomic_add_negative(-1, &page->_count); + BUG_ON(atomic_read(&page->_count) == 0); + return atomic_dec_and_test(&page->_count); } /* @@ -313,10 +304,10 @@ static inline int put_page_testzero(struct page *page) */ static inline int get_page_unless_zero(struct page *page) { - return atomic_add_unless(&page->_count, 1, -1); + return atomic_inc_not_zero(&page->_count); } -#define set_page_count(p,v) atomic_set(&(p)->_count, (v) - 1) +#define set_page_count(p,v) atomic_set(&(p)->_count, (v)) #define __put_page(p) atomic_dec(&(p)->_count) extern void FASTCALL(__page_cache_release(struct page *)); @@ -325,7 +316,7 @@ static inline int page_count(struct page *page) { if (PageCompound(page)) page = (struct page *)page_private(page); - return atomic_read(&page->_count) + 1; + return atomic_read(&page->_count); } static inline void get_page(struct page *page) -- cgit v1.2.3-71-gd317 From 8dfcc9ba27e2ed257e5de9539f7f03e57c2c0e33 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:05 -0800 Subject: [PATCH] mm: split highorder pages Have an explicit mm call to split higher order pages into individual pages. Should help to avoid bugs and be more explicit about the code's intention. Signed-off-by: Nick Piggin Cc: Russell King Cc: David Howells Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: "David S. 
Miller" Cc: Chris Zankel Signed-off-by: Yoichi Yuasa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/consistent.c | 4 ++-- arch/frv/mm/dma-alloc.c | 4 +--- arch/mips/mm/init.c | 5 +++-- arch/ppc/kernel/dma-mapping.c | 4 ++-- arch/sh/mm/consistent.c | 3 +-- arch/xtensa/mm/pgtable.c | 10 +++------- include/linux/mm.h | 6 ++++++ mm/memory.c | 4 +--- mm/page_alloc.c | 22 ++++++++++++++++++++++ 9 files changed, 41 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mm/consistent.c b/arch/arm/mm/consistent.c index c2ee18d2075e..8a1bfcd50087 100644 --- a/arch/arm/mm/consistent.c +++ b/arch/arm/mm/consistent.c @@ -223,6 +223,8 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, pte = consistent_pte[idx] + off; c->vm_pages = page; + split_page(page, order); + /* * Set the "dma handle" */ @@ -231,7 +233,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, do { BUG_ON(!pte_none(*pte)); - set_page_count(page, 1); /* * x86 does not mark the pages reserved... */ @@ -250,7 +251,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, * Free the otherwise unused pages. 
*/ while (page < end) { - set_page_count(page, 1); __free_page(page); page++; } diff --git a/arch/frv/mm/dma-alloc.c b/arch/frv/mm/dma-alloc.c index 342823aad758..636b2f8b5d98 100644 --- a/arch/frv/mm/dma-alloc.c +++ b/arch/frv/mm/dma-alloc.c @@ -115,9 +115,7 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *dma_handle) */ if (order > 0) { struct page *rpage = virt_to_page(page); - - for (i = 1; i < (1 << order); i++) - set_page_count(rpage + i, 1); + split_page(rpage, order); } err = 0; diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 0ff9a348b843..a140da9732db 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -54,7 +54,8 @@ unsigned long empty_zero_page, zero_page_mask; */ unsigned long setup_zero_pages(void) { - unsigned long order, size; + unsigned int order; + unsigned long size; struct page *page; if (cpu_has_vce) @@ -67,9 +68,9 @@ unsigned long setup_zero_pages(void) panic("Oh boy, that early out of memory?"); page = virt_to_page(empty_zero_page); + split_page(page, order); while (page < virt_to_page(empty_zero_page + (PAGE_SIZE << order))) { SetPageReserved(page); - set_page_count(page, 1); page++; } diff --git a/arch/ppc/kernel/dma-mapping.c b/arch/ppc/kernel/dma-mapping.c index 685fd0defe23..61465ec88bc7 100644 --- a/arch/ppc/kernel/dma-mapping.c +++ b/arch/ppc/kernel/dma-mapping.c @@ -223,6 +223,8 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr); struct page *end = page + (1 << order); + split_page(page, order); + /* * Set the "dma handle" */ @@ -231,7 +233,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) do { BUG_ON(!pte_none(*pte)); - set_page_count(page, 1); SetPageReserved(page); set_pte_at(&init_mm, vaddr, pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL))); @@ -244,7 +245,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) * Free the otherwise unused pages. 
*/ while (page < end) { - set_page_count(page, 1); __free_page(page); page++; } diff --git a/arch/sh/mm/consistent.c b/arch/sh/mm/consistent.c index df3a9e452cc5..ee73e30263af 100644 --- a/arch/sh/mm/consistent.c +++ b/arch/sh/mm/consistent.c @@ -23,6 +23,7 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *handle) page = alloc_pages(gfp, order); if (!page) return NULL; + split_page(page, order); ret = page_address(page); *handle = virt_to_phys(ret); @@ -37,8 +38,6 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *handle) end = page + (1 << order); while (++page < end) { - set_page_count(page, 1); - /* Free any unused pages */ if (page >= free) { __free_page(page); diff --git a/arch/xtensa/mm/pgtable.c b/arch/xtensa/mm/pgtable.c index cbc56aedf13e..7d28914d11cb 100644 --- a/arch/xtensa/mm/pgtable.c +++ b/arch/xtensa/mm/pgtable.c @@ -21,13 +21,9 @@ pte_t* pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) p = (pte_t*) __get_free_pages(GFP_KERNEL|__GFP_REPEAT, COLOR_ORDER); if (likely(p)) { - struct page *page; + split_page(virt_to_page(p), COLOR_ORDER); for (i = 0; i < COLOR_SIZE; i++) { - page = virt_to_page(p); - - set_page_count(page, 1); - if (ADDR_COLOR(p) == color) pte = p; else @@ -55,9 +51,9 @@ struct page* pte_alloc_one(struct mm_struct *mm, unsigned long address) p = alloc_pages(GFP_KERNEL | __GFP_REPEAT, PTE_ORDER); if (likely(p)) { - for (i = 0; i < PAGE_ORDER; i++) { - set_page_count(p, 1); + split_page(p, COLOR_ORDER); + for (i = 0; i < PAGE_ORDER; i++) { if (PADDR_COLOR(page_address(p)) == color) page = p; else diff --git a/include/linux/mm.h b/include/linux/mm.h index 9bbddf228cd9..e67980654c49 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -328,6 +328,12 @@ static inline void get_page(struct page *page) void put_page(struct page *page); +#ifdef CONFIG_MMU +void split_page(struct page *page, unsigned int order); +#else +static inline void split_page(struct page *page, unsigned int order) {} +#endif + 
/* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of diff --git a/mm/memory.c b/mm/memory.c index 85e80a57db29..6af555c1c42a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1221,9 +1221,7 @@ out: * The page has to be a nice clean _individual_ kernel allocation. * If you allocate a compound page, you need to have marked it as * such (__GFP_COMP), or manually just split the page up yourself - * (which is mainly an issue of doing "set_page_count(page, 1)" for - * each sub-page, and then freeing them one by one when you free - * them rather than freeing it as a compound page). + * (see split_page()). * * NOTE! Traditionally this was done with "remap_pfn_range()" which * took an arbitrary page protection parameter. This doesn't allow diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 102919851353..fc65e87368b3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -752,6 +752,28 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) clear_highpage(page + i); } +#ifdef CONFIG_MMU +/* + * split_page takes a non-compound higher-order page, and splits it into + * n (1< 0 path. Saves a branch -- cgit v1.2.3-71-gd317 From 9d41415221214ca4820b9464dfa548e2f20e7dd5 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:06 -0800 Subject: [PATCH] mm: page_state comment more Clarify that preemption needs to be guarded against with the __xxx_page_state functions. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8cef69d462f2..9ea629c02a4b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -86,8 +86,9 @@ * - The __xxx_page_state variants can be used safely when interrupts are * disabled. 
* - The __xxx_page_state variants can be used if the field is only - * modified from process context, or only modified from interrupt context. - * In this case, the field should be commented here. + * modified from process context and protected from preemption, or only + * modified from interrupt context. In this case, the field should be + * commented here. */ struct page_state { unsigned long nr_dirty; /* Dirty writeable pages */ -- cgit v1.2.3-71-gd317 From b50ec7d8070ae7a39fe78e65a8812bbc3ca2f7ac Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 22 Mar 2006 00:08:09 -0800 Subject: [PATCH] kcalloc(): INT_MAX -> ULONG_MAX Since size_t has the same size as a long on all architectures, it's enough for overflow checks to check against ULONG_MAX. This change could allow a compiler better optimization (especially in the n=1 case). The practical effect seems to be positive, but quite small: text data bss dec hex filename 21762380 5859870 1848928 29471178 1c1b1ca vmlinux-old 21762211 5859870 1848928 29471009 1c1b121 vmlinux-patched Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 8cf52939d0ab..38bed95dda7a 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -118,7 +118,7 @@ extern void *kzalloc(size_t, gfp_t); */ static inline void *kcalloc(size_t n, size_t size, gfp_t flags) { - if (n != 0 && size > INT_MAX / n) + if (n != 0 && size > ULONG_MAX / n) return NULL; return kzalloc(n * size, flags); } -- cgit v1.2.3-71-gd317 From ac2b898ca6fb06196a26869c23b66afe7944e52e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 22 Mar 2006 00:08:15 -0800 Subject: [PATCH] slab: Remove SLAB_NO_REAP option SLAB_NO_REAP is documented as an option that will cause this slab not to be reaped under memory pressure. However, that is not what happens. 
The only thing that SLAB_NO_REAP controls at the moment is the reclaim of the unused slab elements that were allocated in batch in cache_reap(). Cache_reap() is run every few seconds independently of memory pressure. Could we remove the whole thing? It's only used by three slabs anyway and I cannot find a reason for having this option. There is an additional problem with SLAB_NO_REAP. If set then the recovery of objects from alien caches is switched off. Objects not freed on the same node where they were initially allocated will only be reused if a certain amount of objects accumulates from one alien node (not very likely) or if the cache is explicitly shrunk. (Strangely __cache_shrink does not check for SLAB_NO_REAP.) Getting rid of SLAB_NO_REAP fixes the problems with alien cache freeing. Signed-off-by: Christoph Lameter Cc: Pekka Enberg Cc: Manfred Spraul Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/scsi/iscsi_tcp.c | 2 +- fs/ocfs2/super.c | 2 +- include/linux/slab.h | 1 - mm/slab.c | 13 ++----------- 4 files changed, 4 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index ff79e68b347c..7b82ff090d42 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -3639,7 +3639,7 @@ iscsi_tcp_init(void) taskcache = kmem_cache_create("iscsi_taskcache", sizeof(struct iscsi_data_task), 0, - SLAB_HWCACHE_ALIGN | SLAB_NO_REAP, NULL, NULL); + SLAB_HWCACHE_ALIGN, NULL, NULL); if (!taskcache) return -ENOMEM; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8dd3aafec499..09e1c57a86a0 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -959,7 +959,7 @@ static int ocfs2_initialize_mem_caches(void) ocfs2_lock_cache = kmem_cache_create("ocfs2_lock", sizeof(struct ocfs2_journal_lock), 0, - SLAB_NO_REAP|SLAB_HWCACHE_ALIGN, + SLAB_HWCACHE_ALIGN, NULL, NULL); if (!ocfs2_lock_cache) return -ENOMEM; diff --git a/include/linux/slab.h
b/include/linux/slab.h index 38bed95dda7a..2b28c849d75a 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -38,7 +38,6 @@ typedef struct kmem_cache kmem_cache_t; #define SLAB_DEBUG_INITIAL 0x00000200UL /* Call constructor (as verifier) */ #define SLAB_RED_ZONE 0x00000400UL /* Red zone objs in a cache */ #define SLAB_POISON 0x00000800UL /* Poison objects */ -#define SLAB_NO_REAP 0x00001000UL /* never reap from the cache */ #define SLAB_HWCACHE_ALIGN 0x00002000UL /* align objs on a h/w cache lines */ #define SLAB_CACHE_DMA 0x00004000UL /* use GFP_DMA memory */ #define SLAB_MUST_HWCACHE_ALIGN 0x00008000UL /* force alignment */ diff --git a/mm/slab.c b/mm/slab.c index 5c2574989834..24235506b2a0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -170,12 +170,12 @@ #if DEBUG # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ - SLAB_NO_REAP | SLAB_CACHE_DMA | \ + SLAB_CACHE_DMA | \ SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU) #else -# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ +# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU) @@ -662,7 +662,6 @@ static struct kmem_cache cache_cache = { .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, .buffer_size = sizeof(struct kmem_cache), - .flags = SLAB_NO_REAP, .name = "kmem_cache", #if DEBUG .obj_size = sizeof(struct kmem_cache), @@ -1848,9 +1847,6 @@ static void setup_cpu_cache(struct kmem_cache *cachep) * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check * for buffer overruns. * - * %SLAB_NO_REAP - Don't automatically reap this cache when we're under - * memory pressure. - * * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware * cacheline. This can be beneficial if you're counting cycles as closely * as davem. 
@@ -3584,10 +3580,6 @@ static void cache_reap(void *unused) struct slab *slabp; searchp = list_entry(walk, struct kmem_cache, next); - - if (searchp->flags & SLAB_NO_REAP) - goto next; - check_irq_on(); l3 = searchp->nodelists[numa_node_id()]; @@ -3635,7 +3627,6 @@ static void cache_reap(void *unused) } while (--tofree > 0); next_unlock: spin_unlock_irq(&l3->list_lock); -next: cond_resched(); } check_irq_on(); -- cgit v1.2.3-71-gd317 From 78eef01b0fae087c5fadbd85dd4fe2918c3a015f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 22 Mar 2006 00:08:16 -0800 Subject: [PATCH] on_each_cpu(): disable local interrupts When on_each_cpu() runs the callback on other CPUs, it runs with local interrupts disabled. So we should run the function with local interrupts disabled on this CPU, too. And do the same for UP, so the callback is run in the same environment on both UP and SMP. (strictly it should do preempt_disable() too, but I think local_irq_disable is sufficiently equivalent). Also uninlines on_each_cpu(). softirq.c was the most appropriate file I could find, but it doesn't seem to justify creating a new file. Oh, and fix up that comment over (under?) x86's smp_call_function(). It drives me nuts. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/smp.c | 28 ++++++++++++---------------- include/linux/smp.h | 23 +++++++++-------------- kernel/softirq.c | 20 ++++++++++++++++++++ 3 files changed, 41 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 218d725a5a1e..d134e9643a58 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -504,27 +504,23 @@ void unlock_ipi_call_lock(void) spin_unlock_irq(&call_lock); } -static struct call_data_struct * call_data; - -/* - * this function sends a 'generic call function' IPI to all other CPUs - * in the system. 
- */ - -int smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait) -/* - * [SUMMARY] Run a function on all other CPUs. - * <func> The function to run. This must be fast and non-blocking. - * <info> An arbitrary pointer to pass to the function. - * <nonatomic> currently unused. - * <wait> If true, wait (atomically) until function has completed on other CPUs. - * [RETURNS] 0 on success, else a negative status code. Does not return until +static struct call_data_struct *call_data; + +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: currently unused. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. Does not return until * remote CPUs are nearly ready to execute <<func>> or are or have executed. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler.
*/ +int smp_call_function (void (*func) (void *info), void *info, int nonatomic, + int wait) { struct call_data_struct data; int cpus; diff --git a/include/linux/smp.h b/include/linux/smp.h index 44153fdf73fc..d699a16b0cb2 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -52,23 +52,12 @@ extern void smp_cpus_done(unsigned int max_cpus); /* * Call a function on all other processors */ -extern int smp_call_function (void (*func) (void *info), void *info, - int retry, int wait); +int smp_call_function(void(*func)(void *info), void *info, int retry, int wait); /* * Call a function on all processors */ -static inline int on_each_cpu(void (*func) (void *info), void *info, - int retry, int wait) -{ - int ret = 0; - - preempt_disable(); - ret = smp_call_function(func, info, retry, wait); - func(info); - preempt_enable(); - return ret; -} +int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait); #define MSG_ALL_BUT_SELF 0x8000 /* Assume <32768 CPU's */ #define MSG_ALL 0x8001 @@ -94,7 +83,13 @@ void smp_prepare_boot_cpu(void); #define raw_smp_processor_id() 0 #define hard_smp_processor_id() 0 #define smp_call_function(func,info,retry,wait) ({ 0; }) -#define on_each_cpu(func,info,retry,wait) ({ func(info); 0; }) +#define on_each_cpu(func,info,retry,wait) \ + ({ \ + local_irq_disable(); \ + func(info); \ + local_irq_enable(); \ + 0; \ + }) static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) diff --git a/kernel/softirq.c b/kernel/softirq.c index ad3295cdded5..ec8fed42a86f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -16,6 +16,7 @@ #include #include #include +#include #include /* @@ -495,3 +496,22 @@ __init int spawn_ksoftirqd(void) register_cpu_notifier(&cpu_nfb); return 0; } + +#ifdef CONFIG_SMP +/* + * Call a function on all processors + */ +int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait) +{ + int ret = 0; + + preempt_disable(); + 
ret = smp_call_function(func, info, retry, wait); + local_irq_disable(); + func(info); + local_irq_enable(); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(on_each_cpu); +#endif -- cgit v1.2.3-71-gd317 From 69e05944af39fc6c97b09380c8721e38433bd828 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 22 Mar 2006 00:08:19 -0800 Subject: [PATCH] vmscan: use unsigned longs Turn basically everything in vmscan.c into `unsigned long'. This is to avoid the possibility that some piece of code in there might decide to operate upon more than 4G (or even 2G) of pages in one hit. This might be silly, but we'll need it one day. Cc: Christoph Lameter Cc: Nick Piggin Signed-off-by: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- include/linux/swap.h | 8 ++-- mm/vmscan.c | 104 +++++++++++++++++++++++++++++---------------------- 3 files changed, 64 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e67980654c49..1850cf8bad64 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1046,7 +1046,7 @@ int in_gate_area_no_task(unsigned long addr); int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -int shrink_slab(unsigned long scanned, gfp_t gfp_mask, +unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages); void drop_pagecache(void); void drop_slab(void); diff --git a/include/linux/swap.h b/include/linux/swap.h index d572b19afb7d..3dc6c89c49b8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -172,8 +172,8 @@ extern int rotate_reclaimable_page(struct page *page); extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern int try_to_free_pages(struct zone **, gfp_t); -extern int shrink_all_memory(int); +extern unsigned long try_to_free_pages(struct zone **, gfp_t); +extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int 
vm_swappiness; #ifdef CONFIG_NUMA @@ -190,11 +190,11 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) #ifdef CONFIG_MIGRATION extern int isolate_lru_page(struct page *p); -extern int putback_lru_pages(struct list_head *l); +extern unsigned long putback_lru_pages(struct list_head *l); extern int migrate_page(struct page *, struct page *); extern void migrate_page_copy(struct page *, struct page *); extern int migrate_page_remove_references(struct page *, struct page *, int); -extern int migrate_pages(struct list_head *l, struct list_head *t, +extern unsigned long migrate_pages(struct list_head *l, struct list_head *t, struct list_head *moved, struct list_head *failed); extern int fail_migrate_page(struct page *, struct page *); #else diff --git a/mm/vmscan.c b/mm/vmscan.c index 5feef4d4650e..62cd7cd257e3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -177,10 +177,11 @@ EXPORT_SYMBOL(remove_shrinker); * * Returns the number of slab objects which we shrunk. */ -int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) +unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, + unsigned long lru_pages) { struct shrinker *shrinker; - int ret = 0; + unsigned long ret = 0; if (scanned == 0) scanned = SWAP_CLUSTER_MAX; @@ -410,12 +411,13 @@ cannot_free: /* * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed */ -static int shrink_list(struct list_head *page_list, struct scan_control *sc) +static unsigned long shrink_list(struct list_head *page_list, + struct scan_control *sc) { LIST_HEAD(ret_pages); struct pagevec freed_pvec; int pgactivate = 0; - int reclaimed = 0; + unsigned long reclaimed = 0; cond_resched(); @@ -599,11 +601,11 @@ static inline void move_to_lru(struct page *page) * * returns the number of pages put back. 
*/ -int putback_lru_pages(struct list_head *l) +unsigned long putback_lru_pages(struct list_head *l) { struct page *page; struct page *page2; - int count = 0; + unsigned long count = 0; list_for_each_entry_safe(page, page2, l, lru) { move_to_lru(page); @@ -848,11 +850,11 @@ EXPORT_SYMBOL(migrate_page); * * Return: Number of pages not migrated when "to" ran empty. */ -int migrate_pages(struct list_head *from, struct list_head *to, +unsigned long migrate_pages(struct list_head *from, struct list_head *to, struct list_head *moved, struct list_head *failed) { - int retry; - int nr_failed = 0; + unsigned long retry; + unsigned long nr_failed = 0; int pass = 0; struct page *page; struct page *page2; @@ -1069,12 +1071,13 @@ int isolate_lru_page(struct page *page) * * returns how many pages were moved onto *@dst. */ -static int isolate_lru_pages(int nr_to_scan, struct list_head *src, - struct list_head *dst, int *scanned) +static unsigned long isolate_lru_pages(unsigned long nr_to_scan, + struct list_head *src, struct list_head *dst, + unsigned long *scanned) { - int nr_taken = 0; + unsigned long nr_taken = 0; struct page *page; - int scan = 0; + unsigned long scan = 0; while (scan++ < nr_to_scan && !list_empty(src)) { struct list_head *target; @@ -1106,20 +1109,22 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, /* * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed */ -static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *sc) +static void shrink_cache(unsigned long max_scan, struct zone *zone, + struct scan_control *sc) { LIST_HEAD(page_list); struct pagevec pvec; + unsigned long nr_scanned = 0; pagevec_init(&pvec, 1); lru_add_drain(); spin_lock_irq(&zone->lru_lock); - while (max_scan > 0) { + do { struct page *page; - int nr_taken; - int nr_scan; - int nr_freed; + unsigned long nr_taken; + unsigned long nr_scan; + unsigned long nr_freed; nr_taken = isolate_lru_pages(sc->swap_cluster_max, 
&zone->inactive_list, @@ -1131,7 +1136,7 @@ static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *s if (nr_taken == 0) goto done; - max_scan -= nr_scan; + nr_scanned += nr_scan; nr_freed = shrink_list(&page_list, sc); local_irq_disable(); @@ -1161,7 +1166,7 @@ static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *s spin_lock_irq(&zone->lru_lock); } } - } + } while (nr_scanned < max_scan); spin_unlock_irq(&zone->lru_lock); done: pagevec_release(&pvec); @@ -1185,11 +1190,12 @@ done: * But we had to alter page->flags anyway. */ static void -refill_inactive_zone(int nr_pages, struct zone *zone, struct scan_control *sc) +refill_inactive_zone(unsigned long nr_pages, struct zone *zone, + struct scan_control *sc) { - int pgmoved; + unsigned long pgmoved; int pgdeactivate = 0; - int pgscanned; + unsigned long pgscanned; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ LIST_HEAD(l_active); /* Pages to go onto the active_list */ @@ -1323,8 +1329,8 @@ refill_inactive_zone(int nr_pages, struct zone *zone, struct scan_control *sc) /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void -shrink_zone(int priority, struct zone *zone, struct scan_control *sc) +static void shrink_zone(int priority, struct zone *zone, + struct scan_control *sc) { unsigned long nr_active; unsigned long nr_inactive; @@ -1387,8 +1393,8 @@ shrink_zone(int priority, struct zone *zone, struct scan_control *sc) * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. 
*/ -static void -shrink_caches(int priority, struct zone **zones, struct scan_control *sc) +static void shrink_caches(int priority, struct zone **zones, + struct scan_control *sc) { int i; @@ -1425,11 +1431,12 @@ shrink_caches(int priority, struct zone **zones, struct scan_control *sc) * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. */ -int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) +unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) { int priority; int ret = 0; - int total_scanned = 0, total_reclaimed = 0; + unsigned long total_scanned = 0; + unsigned long total_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long lru_pages = 0; int i; @@ -1525,13 +1532,15 @@ out: * the page allocator fallback scheme to ensure that aging of pages is balanced * across the zones. */ -static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, + int order) { - int to_free = nr_pages; + unsigned long to_free = nr_pages; int all_zones_ok; int priority; int i; - int total_scanned, total_reclaimed; + unsigned long total_scanned; + unsigned long total_reclaimed; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -1776,22 +1785,23 @@ void wakeup_kswapd(struct zone *zone, int order) * Try to free `nr_pages' of memory, system-wide. Returns the number of freed * pages. 
*/ -int shrink_all_memory(int nr_pages) +unsigned long shrink_all_memory(unsigned long nr_pages) { pg_data_t *pgdat; - int nr_to_free = nr_pages; - int ret = 0; + unsigned long nr_to_free = nr_pages; + unsigned long ret = 0; struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; current->reclaim_state = &reclaim_state; for_each_pgdat(pgdat) { - int freed; + unsigned long freed; + freed = balance_pgdat(pgdat, nr_to_free, 0); ret += freed; nr_to_free -= freed; - if (nr_to_free <= 0) + if ((long)nr_to_free <= 0) break; } current->reclaim_state = NULL; @@ -1805,8 +1815,7 @@ int shrink_all_memory(int nr_pages) away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ static int __devinit cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) + unsigned long action, void *hcpu) { pg_data_t *pgdat; cpumask_t mask; @@ -1826,10 +1835,15 @@ static int __devinit cpu_callback(struct notifier_block *nfb, static int __init kswapd_init(void) { pg_data_t *pgdat; + swap_setup(); - for_each_pgdat(pgdat) - pgdat->kswapd - = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); + for_each_pgdat(pgdat) { + pid_t pid; + + pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); + BUG_ON(pid < 0); + pgdat->kswapd = find_task_by_pid(pid); + } total_memory = nr_free_pagecache_pages(); hotcpu_notifier(cpu_callback, 0); return 0; @@ -1873,7 +1887,7 @@ int zone_reclaim_interval __read_mostly = 30*HZ; */ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) { - const int nr_pages = 1 << order; + const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; int priority; @@ -1881,7 +1895,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), .nr_mapped = read_page_state(nr_mapped), - .swap_cluster_max = 
max(nr_pages, SWAP_CLUSTER_MAX), + .swap_cluster_max = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, }; @@ -1966,4 +1981,3 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) return __zone_reclaim(zone, gfp_mask, order); } #endif - -- cgit v1.2.3-71-gd317 From 0f8053a509ceba4a077a50ea7b77039b5559b428 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:33 -0800 Subject: [PATCH] mm: make __put_page internal Remove __put_page from outside the core mm/. It is dangerous because it does not handle compound pages nicely, and misses 1->0 transitions. If a user later appears that really needs the extra speed we can reevaluate. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/filemap.c | 2 ++ mm/internal.h | 11 +++++++++++ mm/vmscan.c | 2 ++ 4 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1850cf8bad64..9b3cdfc8046d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -308,7 +308,6 @@ static inline int get_page_unless_zero(struct page *page) } #define set_page_count(p,v) atomic_set(&(p)->_count, (v)) -#define __put_page(p) atomic_dec(&(p)->_count) extern void FASTCALL(__page_cache_release(struct page *)); diff --git a/mm/filemap.c b/mm/filemap.c index 44da3d476994..e8f58f7dd7a5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -30,6 +30,8 @@ #include #include #include "filemap.h" +#include "internal.h" + /* * FIXME: remove all knowledge of the buffer layer from the core VM */ diff --git a/mm/internal.h b/mm/internal.h index 17256bb2f4ef..e3042db2a2d6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -8,6 +8,10 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
*/ +#ifndef __MM_INTERNAL_H +#define __MM_INTERNAL_H + +#include static inline void set_page_refs(struct page *page, int order) { @@ -26,5 +30,12 @@ static inline void set_page_refs(struct page *page, int order) #endif /* CONFIG_MMU */ } +static inline void __put_page(struct page *page) +{ + atomic_dec(&page->_count); +} + extern void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order); + +#endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 486184d2b50c..3914a94aa905 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -39,6 +39,8 @@ #include +#include "internal.h" + /* possible outcome of pageout() */ typedef enum { /* failed to write page out, page is locked */ -- cgit v1.2.3-71-gd317 From 84097518d1ecd2330f9488e4c2d09953a3340e74 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:34 -0800 Subject: [PATCH] mm: nommu use compound pages Now that compound page handling is properly fixed in the VM, move nommu over to using compound pages rather than rolling their own refcounting. nommu vm page refcounting is broken anyway, but there is no need to have divergent code in the core VM now, nor when it gets fixed. Signed-off-by: Nick Piggin Cc: David Howells (Needs testing, please). 
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 3 +-- include/linux/mm.h | 4 ---- mm/internal.h | 12 ------------ mm/nommu.c | 4 ++-- mm/page_alloc.c | 7 ------- mm/slab.c | 9 ++++++++- 6 files changed, 11 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 3f810acd0bfa..b1ca234068f6 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -87,8 +87,7 @@ static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) xpages = 1UL << order; npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT; - for (loop = 0; loop < npages; loop++) - set_page_count(pages + loop, 1); + split_page(pages, order); /* trim off any pages we don't actually require */ for (loop = npages; loop < xpages; loop++) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9b3cdfc8046d..3d84b7a35e0d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -327,11 +327,7 @@ static inline void get_page(struct page *page) void put_page(struct page *page); -#ifdef CONFIG_MMU void split_page(struct page *page, unsigned int order); -#else -static inline void split_page(struct page *page, unsigned int order) {} -#endif /* * Multiple processes may "see" the same page. E.g. for untouched diff --git a/mm/internal.h b/mm/internal.h index e3042db2a2d6..7bb339779818 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -15,19 +15,7 @@ static inline void set_page_refs(struct page *page, int order) { -#ifdef CONFIG_MMU set_page_count(page, 1); -#else - int i; - - /* - * We need to reference all the pages for this order, otherwise if - * anyone accesses one of the pages with (get/put) it will be freed. 
- * - eg: access_process_vm() - */ - for (i = 0; i < (1 << order); i++) - set_page_count(page + i, 1); -#endif /* CONFIG_MMU */ } static inline void __put_page(struct page *page) diff --git a/mm/nommu.c b/mm/nommu.c index 4951f4786f28..db45efac17cc 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) /* * kmalloc doesn't like __GFP_HIGHMEM for some reason */ - return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM); + return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); } struct page * vmalloc_to_page(void *addr) @@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) * - note that this may not return a page-aligned address if the object * we're allocating is smaller than a page */ - base = kmalloc(len, GFP_KERNEL); + base = kmalloc(len, GFP_KERNEL|__GFP_COMP); if (!base) goto enomem; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7aa0181287e1..e197818a7cf6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -422,11 +422,6 @@ static void __free_pages_ok(struct page *page, unsigned int order) mutex_debug_check_no_locks_freed(page_address(page), PAGE_SIZE<lru.next; } @@ -600,6 +602,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab) static inline struct slab *page_get_slab(struct page *page) { + if (unlikely(PageCompound(page))) + page = (struct page *)page_private(page); return (struct slab *)page->lru.prev; } @@ -2412,8 +2416,11 @@ static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, struct page *page; /* Nasty!!!!!! I hope this is OK. 
*/ - i = 1 << cachep->gfporder; page = virt_to_page(objp); + + i = 1; + if (likely(!PageCompound(page))) + i <<= cachep->gfporder; do { page_set_cache(page, cachep); page_set_slab(page, slabp); -- cgit v1.2.3-71-gd317 From 7835e98b2e3c66dba79cb0ff8ebb90a2fe030c29 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:40 -0800 Subject: [PATCH] remove set_page_count() outside mm/ set_page_count usage outside mm/ is limited to setting the refcount to 1. Remove set_page_count from outside mm/, and replace those users with init_page_count() and set_page_refcounted(). This allows more debug checking, and tighter control on how code is allowed to play around with page->_count. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/init.c | 2 +- arch/arm/mm/init.c | 2 +- arch/arm26/mm/init.c | 2 +- arch/cris/mm/init.c | 2 +- arch/frv/mm/init.c | 6 +++--- arch/h8300/mm/init.c | 4 ++-- arch/i386/mm/init.c | 6 +++--- arch/ia64/mm/init.c | 6 +++--- arch/m32r/mm/init.c | 4 ++-- arch/m68k/mm/init.c | 2 +- arch/m68k/mm/memory.c | 2 +- arch/m68k/mm/motorola.c | 2 +- arch/m68knommu/mm/init.c | 4 ++-- arch/mips/arc/memory.c | 2 +- arch/mips/dec/prom/memory.c | 2 +- arch/mips/mips-boards/generic/memory.c | 2 +- arch/mips/mips-boards/sim/sim_mem.c | 2 +- arch/mips/mm/init.c | 6 +++--- arch/mips/sgi-ip27/ip27-memory.c | 2 +- arch/parisc/mm/init.c | 4 ++-- arch/powerpc/mm/init_32.c | 4 ++-- arch/powerpc/mm/init_64.c | 4 ++-- arch/powerpc/mm/mem.c | 4 ++-- arch/powerpc/platforms/cell/setup.c | 2 +- arch/ppc/mm/init.c | 6 +++--- arch/s390/mm/init.c | 4 ++-- arch/sh/mm/init.c | 4 ++-- arch/sh64/mm/init.c | 4 ++-- arch/sparc/kernel/sun4d_smp.c | 6 +++--- arch/sparc/kernel/sun4m_smp.c | 6 +++--- arch/sparc/mm/init.c | 6 +++--- arch/sparc64/mm/init.c | 4 ++-- arch/um/kernel/mem.c | 4 ++-- arch/x86_64/mm/init.c | 6 +++--- arch/xtensa/mm/init.c | 2 +- drivers/video/acornfb.c | 2 +- include/linux/mm.h | 11 +++++++++-- mm/hugetlb.c 
| 5 +++-- mm/internal.h | 13 ++++++++++++- mm/page_alloc.c | 14 ++++++-------- 40 files changed, 96 insertions(+), 79 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 486d7945583d..544ac5dc09eb 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -357,7 +357,7 @@ free_reserved_mem(void *start, void *end) void *__start = start; for (; __start < end; __start += PAGE_SIZE) { ClearPageReserved(virt_to_page(__start)); - set_page_count(virt_to_page(__start), 1); + init_page_count(virt_to_page(__start)); free_page((long)__start); totalram_pages++; } diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 8b276ee38acf..b0321e943b76 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -531,7 +531,7 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s) for (; addr < end; addr += PAGE_SIZE) { struct page *page = virt_to_page(addr); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(addr); totalram_pages++; } diff --git a/arch/arm26/mm/init.c b/arch/arm26/mm/init.c index 1f09a9d0fb83..e3ecaa453747 100644 --- a/arch/arm26/mm/init.c +++ b/arch/arm26/mm/init.c @@ -324,7 +324,7 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s) for (; addr < end; addr += PAGE_SIZE) { struct page *page = virt_to_page(addr); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(addr); totalram_pages++; } diff --git a/arch/cris/mm/init.c b/arch/cris/mm/init.c index 31a0018b525a..b7842ff213a6 100644 --- a/arch/cris/mm/init.c +++ b/arch/cris/mm/init.c @@ -216,7 +216,7 @@ free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c index 
765088ea8a50..8899aa1a4f06 100644 --- a/arch/frv/mm/init.c +++ b/arch/frv/mm/init.c @@ -169,7 +169,7 @@ void __init mem_init(void) struct page *page = &mem_map[pfn]; ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; } @@ -210,7 +210,7 @@ void __init free_initmem(void) /* next to check that the page we free is not a partial page */ for (addr = start; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -230,7 +230,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index 1e0929ddc8c4..09efc4b1f038 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -196,7 +196,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; @@ -219,7 +219,7 @@ free_initmem() /* next to check that the page we free is not a partial page */ for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 2700f01994ba..7ba55a6e2dbc 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -270,7 +270,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) static void __meminit free_new_highpage(struct page *page) { - 
set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } @@ -727,7 +727,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); memset((void *)addr, 0xcc, PAGE_SIZE); free_page(addr); totalram_pages++; @@ -766,7 +766,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index b38b6d213c15..08d94e6bfa18 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -197,7 +197,7 @@ free_initmem (void) eaddr = (unsigned long) ia64_imva(__init_end); while (addr < eaddr) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); ++totalram_pages; addr += PAGE_SIZE; @@ -252,7 +252,7 @@ free_initrd_mem (unsigned long start, unsigned long end) continue; page = virt_to_page(start); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(start); ++totalram_pages; } @@ -640,7 +640,7 @@ mem_init (void) void online_page(struct page *page) { ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; num_physpages++; diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index 6facf15b04f3..c9e7dad860b7 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -226,7 +226,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - 
set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -244,7 +244,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) unsigned long p; for (p = start; p < end; p += PAGE_SIZE) { ClearPageReserved(virt_to_page(p)); - set_page_count(virt_to_page(p), 1); + init_page_count(virt_to_page(p)); free_page(p); totalram_pages++; } diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index c45beb955943..a190e39c907a 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -137,7 +137,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c index 559942ce0e1e..d6d582a5abb0 100644 --- a/arch/m68k/mm/memory.c +++ b/arch/m68k/mm/memory.c @@ -54,7 +54,7 @@ void __init init_pointer_table(unsigned long ptable) /* unreserve the page so it's possible to free that page */ PD_PAGE(dp)->flags &= ~(1 << PG_reserved); - set_page_count(PD_PAGE(dp), 1); + init_page_count(PD_PAGE(dp)); return; } diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index d855fec26317..afb57eeafdcb 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -276,7 +276,7 @@ void free_initmem(void) addr = (unsigned long)&__init_begin; for (; addr < (unsigned long)&__init_end; addr += PAGE_SIZE) { virt_to_page(addr)->flags &= ~(1 << PG_reserved); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/m68knommu/mm/init.c b/arch/m68knommu/mm/init.c index 89f0b554ffb7..d79503fe6e42 100644 --- a/arch/m68knommu/mm/init.c +++ b/arch/m68knommu/mm/init.c @@ -195,7 +195,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for 
(; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; @@ -218,7 +218,7 @@ free_initmem() /* next to check that the page we free is not a partial page */ for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/mips/arc/memory.c b/arch/mips/arc/memory.c index 958d2eb78862..8a9ef58cc399 100644 --- a/arch/mips/arc/memory.c +++ b/arch/mips/arc/memory.c @@ -158,7 +158,7 @@ unsigned long __init prom_free_prom_memory(void) while (addr < boot_mem_map.map[i].addr + boot_mem_map.map[i].size) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; freed += PAGE_SIZE; diff --git a/arch/mips/dec/prom/memory.c b/arch/mips/dec/prom/memory.c index 81cb5a76cfb7..1edaf3074ee9 100644 --- a/arch/mips/dec/prom/memory.c +++ b/arch/mips/dec/prom/memory.c @@ -118,7 +118,7 @@ unsigned long __init prom_free_prom_memory(void) addr = PAGE_SIZE; while (addr < end) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; } diff --git a/arch/mips/mips-boards/generic/memory.c b/arch/mips/mips-boards/generic/memory.c index 2c8afd77a20b..ee5e70c95cf3 100644 --- a/arch/mips/mips-boards/generic/memory.c +++ b/arch/mips/mips-boards/generic/memory.c @@ -174,7 +174,7 @@ unsigned long __init prom_free_prom_memory(void) while (addr < boot_mem_map.map[i].addr + boot_mem_map.map[i].size) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + 
init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; freed += PAGE_SIZE; diff --git a/arch/mips/mips-boards/sim/sim_mem.c b/arch/mips/mips-boards/sim/sim_mem.c index 0dbd7435bb2a..1ec4e75656bd 100644 --- a/arch/mips/mips-boards/sim/sim_mem.c +++ b/arch/mips/mips-boards/sim/sim_mem.c @@ -117,7 +117,7 @@ unsigned long __init prom_free_prom_memory(void) while (addr < boot_mem_map.map[i].addr + boot_mem_map.map[i].size) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; freed += PAGE_SIZE; diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index a140da9732db..52f7d59fe612 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -245,7 +245,7 @@ void __init mem_init(void) #ifdef CONFIG_LIMITED_DMA set_page_address(page, lowmem_page_address(page)); #endif - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } @@ -292,7 +292,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } @@ -315,7 +315,7 @@ void free_initmem(void) page = addr; #endif ClearPageReserved(virt_to_page(page)); - set_page_count(virt_to_page(page), 1); + init_page_count(virt_to_page(page)); free_page(page); totalram_pages++; freed += PAGE_SIZE; diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index ed93a9792959..e0d095daa5ed 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -559,7 +559,7 @@ void __init mem_init(void) /* if (!page_is_ram(pgnr)) continue; */ /* commented out until page_is_ram works */ ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); 
totalram_pages++; } diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 7847ca13d6c2..852eda3953dc 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -398,7 +398,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); num_physpages++; totalram_pages++; @@ -1018,7 +1018,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); num_physpages++; totalram_pages++; diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 7d0d75c11848..b57fb3a2b7bb 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -216,7 +216,7 @@ static void free_sec(unsigned long start, unsigned long end, const char *name) while (start < end) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); cnt++; start += PAGE_SIZE; @@ -248,7 +248,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 81cfb0c2ec58..bacb71c89811 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -140,7 +140,7 @@ void free_initmem(void) for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) { memset((void *)addr, 0xcc, PAGE_SIZE); 
ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -155,7 +155,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 6ae5c130d0db..454cac01d8cc 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -108,7 +108,7 @@ EXPORT_SYMBOL(phys_mem_access_prot); void online_page(struct page *page) { ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; num_physpages++; @@ -376,7 +376,7 @@ void __init mem_init(void) struct page *page = pfn_to_page(pfn); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c index b33a4443f5a9..fec8e65b36ea 100644 --- a/arch/powerpc/platforms/cell/setup.c +++ b/arch/powerpc/platforms/cell/setup.c @@ -115,7 +115,7 @@ static void __init cell_spuprop_present(struct device_node *spe, for (pfn = start_pfn; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); set_page_links(page, ZONE_DMA, node_id, pfn); - set_page_count(page, 1); + init_page_count(page); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); diff --git a/arch/ppc/mm/init.c b/arch/ppc/mm/init.c index 134db5c04203..cb1c294fb932 100644 --- a/arch/ppc/mm/init.c +++ b/arch/ppc/mm/init.c @@ -140,7 +140,7 @@ static void free_sec(unsigned long start, unsigned long end, const char *name) while (start < end) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + 
init_page_count(virt_to_page(start)); free_page(start); cnt++; start += PAGE_SIZE; @@ -172,7 +172,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } @@ -441,7 +441,7 @@ void __init mem_init(void) struct page *page = mem_map + pfn; ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index df953383724d..a055894f3bd8 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -292,7 +292,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -307,7 +307,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index e342565f75fb..77b4a838fe10 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -273,7 +273,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -286,7 +286,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) unsigned long p; for (p = start; p < end; p += PAGE_SIZE) { ClearPageReserved(virt_to_page(p)); - 
set_page_count(virt_to_page(p), 1); + init_page_count(virt_to_page(p)); free_page(p); totalram_pages++; } diff --git a/arch/sh64/mm/init.c b/arch/sh64/mm/init.c index a65e8bb2c3cc..1169757fb38b 100644 --- a/arch/sh64/mm/init.c +++ b/arch/sh64/mm/init.c @@ -173,7 +173,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -186,7 +186,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) unsigned long p; for (p = start; p < end; p += PAGE_SIZE) { ClearPageReserved(virt_to_page(p)); - set_page_count(virt_to_page(p), 1); + init_page_count(virt_to_page(p)); free_page(p); totalram_pages++; } diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c index 40d426cce824..4219dd2ce3a2 100644 --- a/arch/sparc/kernel/sun4d_smp.c +++ b/arch/sparc/kernel/sun4d_smp.c @@ -266,19 +266,19 @@ void __init smp4d_boot_cpus(void) /* Free unneeded trap tables */ ClearPageReserved(virt_to_page(trapbase_cpu1)); - set_page_count(virt_to_page(trapbase_cpu1), 1); + init_page_count(virt_to_page(trapbase_cpu1)); free_page((unsigned long)trapbase_cpu1); totalram_pages++; num_physpages++; ClearPageReserved(virt_to_page(trapbase_cpu2)); - set_page_count(virt_to_page(trapbase_cpu2), 1); + init_page_count(virt_to_page(trapbase_cpu2)); free_page((unsigned long)trapbase_cpu2); totalram_pages++; num_physpages++; ClearPageReserved(virt_to_page(trapbase_cpu3)); - set_page_count(virt_to_page(trapbase_cpu3), 1); + init_page_count(virt_to_page(trapbase_cpu3)); free_page((unsigned long)trapbase_cpu3); totalram_pages++; num_physpages++; diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c index a21f27d10e55..fbbd8a474c4c 100644 --- a/arch/sparc/kernel/sun4m_smp.c +++ b/arch/sparc/kernel/sun4m_smp.c @@ -233,21 +233,21 @@ void 
__init smp4m_boot_cpus(void) /* Free unneeded trap tables */ if (!cpu_isset(i, cpu_present_map)) { ClearPageReserved(virt_to_page(trapbase_cpu1)); - set_page_count(virt_to_page(trapbase_cpu1), 1); + init_page_count(virt_to_page(trapbase_cpu1)); free_page((unsigned long)trapbase_cpu1); totalram_pages++; num_physpages++; } if (!cpu_isset(2, cpu_present_map)) { ClearPageReserved(virt_to_page(trapbase_cpu2)); - set_page_count(virt_to_page(trapbase_cpu2), 1); + init_page_count(virt_to_page(trapbase_cpu2)); free_page((unsigned long)trapbase_cpu2); totalram_pages++; num_physpages++; } if (!cpu_isset(3, cpu_present_map)) { ClearPageReserved(virt_to_page(trapbase_cpu3)); - set_page_count(virt_to_page(trapbase_cpu3), 1); + init_page_count(virt_to_page(trapbase_cpu3)); free_page((unsigned long)trapbase_cpu3); totalram_pages++; num_physpages++; diff --git a/arch/sparc/mm/init.c b/arch/sparc/mm/init.c index c03babaa0498..898669732466 100644 --- a/arch/sparc/mm/init.c +++ b/arch/sparc/mm/init.c @@ -383,7 +383,7 @@ void map_high_region(unsigned long start_pfn, unsigned long end_pfn) struct page *page = pfn_to_page(tmp); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } @@ -480,7 +480,7 @@ void free_initmem (void) p = virt_to_page(addr); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); totalram_pages++; num_physpages++; @@ -497,7 +497,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) struct page *p = virt_to_page(start); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; } diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index c2b556106fc1..2ae143ba50d8 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -1461,7 +1461,7 @@ void free_initmem(void) p = virt_to_page(page); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; totalram_pages++; 
@@ -1477,7 +1477,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) struct page *p = virt_to_page(start); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; totalram_pages++; diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index fa4f915be5c5..92cce96b5e24 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -57,7 +57,7 @@ static void setup_highmem(unsigned long highmem_start, for(i = 0; i < highmem_len >> PAGE_SHIFT; i++){ page = &mem_map[highmem_pfn + i]; ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); } } @@ -296,7 +296,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 7af1742aa958..40ed13d263cd 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -486,7 +486,7 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size) void online_page(struct page *page) { ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; num_physpages++; @@ -592,7 +592,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); free_page(addr); totalram_pages++; @@ -632,7 +632,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - 
set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 5a91d6c9e66d..e1be4235f367 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -272,7 +272,7 @@ free_reserved_mem(void *start, void *end) { for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page((unsigned long)start); totalram_pages++; } diff --git a/drivers/video/acornfb.c b/drivers/video/acornfb.c index b058273527bb..76448d6ae896 100644 --- a/drivers/video/acornfb.c +++ b/drivers/video/acornfb.c @@ -1269,7 +1269,7 @@ free_unused_pages(unsigned int virtual_start, unsigned int virtual_end) */ page = virt_to_page(virtual_start); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(virtual_start); virtual_start += PAGE_SIZE; diff --git a/include/linux/mm.h b/include/linux/mm.h index 3d84b7a35e0d..7d8c127daad7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -307,8 +307,6 @@ static inline int get_page_unless_zero(struct page *page) return atomic_inc_not_zero(&page->_count); } -#define set_page_count(p,v) atomic_set(&(p)->_count, (v)) - extern void FASTCALL(__page_cache_release(struct page *)); static inline int page_count(struct page *page) @@ -325,6 +323,15 @@ static inline void get_page(struct page *page) atomic_inc(&page->_count); } +/* + * Setup the page count before being freed into the page allocator for + * the first time (boot or memory hotplug) + */ +static inline void init_page_count(struct page *page) +{ + atomic_set(&page->_count, 1); +} + void put_page(struct page *page); void split_page(struct page *page, unsigned int order); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 39d49ecea8e8..20117a4b8ab6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -18,6 +18,7 @@ #include #include +#include "internal.h" 
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; static unsigned long nr_huge_pages, free_huge_pages; @@ -106,7 +107,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) return NULL; } spin_unlock(&hugetlb_lock); - set_page_count(page, 1); + set_page_refcounted(page); for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) clear_user_highpage(&page[i], addr); return page; @@ -152,7 +153,7 @@ static void update_and_free_page(struct page *page) 1 << PG_private | 1<< PG_writeback); } page[1].lru.next = NULL; - set_page_count(page, 1); + set_page_refcounted(page); __free_pages(page, HUGETLB_PAGE_ORDER); } diff --git a/mm/internal.h b/mm/internal.h index 7bb339779818..d20e3cc4aef0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -13,8 +13,19 @@ #include -static inline void set_page_refs(struct page *page, int order) +static inline void set_page_count(struct page *page, int v) { + atomic_set(&page->_count, v); +} + +/* + * Turn a non-refcounted page (->_count == 0) into refcounted with + * a count of one. 
+ */ +static inline void set_page_refcounted(struct page *page) +{ + BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); + BUG_ON(atomic_read(&page->_count)); set_page_count(page, 1); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e197818a7cf6..7f65b5a63bb3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -442,7 +442,7 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) if (order == 0) { __ClearPageReserved(page); set_page_count(page, 0); - set_page_refs(page, 0); + set_page_refcounted(page); __free_page(page); } else { int loop; @@ -457,7 +457,7 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) set_page_count(p, 0); } - set_page_refs(page, order); + set_page_refcounted(page); __free_pages(page, order); } } @@ -525,7 +525,7 @@ static int prep_new_page(struct page *page, int order) 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked | 1 << PG_mappedtodisk); set_page_private(page, 0); - set_page_refs(page, order); + set_page_refcounted(page); kernel_map_pages(page, 1 << order, 1); return 0; } @@ -755,10 +755,8 @@ void split_page(struct page *page, unsigned int order) BUG_ON(PageCompound(page)); BUG_ON(!page_count(page)); - for (i = 1; i < (1 << order); i++) { - BUG_ON(page_count(page + i)); - set_page_count(page + i, 1); - } + for (i = 1; i < (1 << order); i++) + set_page_refcounted(page + i); } /* @@ -1771,7 +1769,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); - set_page_count(page, 1); + init_page_count(page); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); -- cgit v1.2.3-71-gd317 From 617d2214ee06c209e5c375c280d50abace8058e1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:43 -0800 Subject: [PATCH] mm: optimise page_count Optimise page_count compound page test and make it consistent with similar 
functions. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7d8c127daad7..6aa016f1d3ae 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -311,7 +311,7 @@ extern void FASTCALL(__page_cache_release(struct page *)); static inline int page_count(struct page *page) { - if (PageCompound(page)) + if (unlikely(PageCompound(page))) page = (struct page *)page_private(page); return atomic_read(&page->_count); } -- cgit v1.2.3-71-gd317 From 8f860591ffb29738cf5539b6fbf27f50dcdeb380 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Wed, 22 Mar 2006 00:08:50 -0800 Subject: [PATCH] Enable mprotect on huge pages 2.6.16-rc3 uses hugetlb on-demand paging, but it doesn't support hugetlb mprotect. From: David Gibson Remove a test from the mprotect() path which checks that the mprotect()ed range on a hugepage VMA is hugepage aligned (yes, really, the sense of is_aligned_hugepage_range() is the opposite of what you'd guess :-/). In fact, we don't need this test. If the given addresses match the beginning/end of a hugepage VMA they must already be suitably aligned. If they don't, then mprotect_fixup() will attempt to split the VMA. The very first test in split_vma() will check for a badly aligned address on a hugepage VMA and return -EINVAL if necessary. From: "Chen, Kenneth W" On i386 and x86-64, pte flag _PAGE_PSE collides with _PAGE_PROTNONE. The identity of hugetlb pte is lost when changing page protection via mprotect. A page fault that occurs later will trigger a bug check in huge_pte_alloc(). The fix is to always make new pte a hugetlb pte and also to clean up legacy code where _PAGE_PRESENT is forced on in the pre-faulting day. Signed-off-by: Zhang Yanmin Cc: David Gibson Cc: "David S. 
Miller" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: William Lee Irwin III Signed-off-by: Ken Chen Signed-off-by: Nishanth Aravamudan Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/pgtable.h | 5 ++--- include/asm-ia64/pgtable.h | 2 +- include/asm-x86_64/pgtable.h | 4 ++-- include/linux/hugetlb.h | 4 ++++ mm/hugetlb.c | 29 +++++++++++++++++++++++++++++ mm/mprotect.c | 12 +++++------- 6 files changed, 43 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 088a945bf26b..ee056c41a9fb 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -219,13 +219,12 @@ extern unsigned long pg0[]; * The following only work if pte_present() is true. * Undefined behaviour if not.. */ -#define __LARGE_PTE (_PAGE_PSE | _PAGE_PRESENT) static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; } static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; } static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; } -static inline int pte_huge(pte_t pte) { return ((pte).pte_low & __LARGE_PTE) == __LARGE_PTE; } +static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; } /* * The following only works if pte_present() is not true. 
@@ -242,7 +241,7 @@ static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } -static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= __LARGE_PTE; return pte; } +static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; } #ifdef CONFIG_X86_PAE # include diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h index e2560c58384b..5890972a69bf 100644 --- a/include/asm-ia64/pgtable.h +++ b/include/asm-ia64/pgtable.h @@ -314,7 +314,7 @@ ia64_phys_addr_valid (unsigned long addr) #define pte_mkyoung(pte) (__pte(pte_val(pte) | _PAGE_A)) #define pte_mkclean(pte) (__pte(pte_val(pte) & ~_PAGE_D)) #define pte_mkdirty(pte) (__pte(pte_val(pte) | _PAGE_D)) -#define pte_mkhuge(pte) (__pte(pte_val(pte) | _PAGE_P)) +#define pte_mkhuge(pte) (__pte(pte_val(pte))) /* * Macro to a page protection value as "uncacheable". 
Note that "protection" is really a diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 715fd94cf577..a617d364d08d 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -273,7 +273,7 @@ static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; } static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; } -static inline int pte_huge(pte_t pte) { return (pte_val(pte) & __LARGE_PTE) == __LARGE_PTE; } +static inline int pte_huge(pte_t pte) { return pte_val(pte) & _PAGE_PSE; } static inline pte_t pte_rdprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } static inline pte_t pte_exprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; } @@ -285,7 +285,7 @@ static inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _ static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; } static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; } -static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | __LARGE_PTE)); return pte; } +static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_PSE)); return pte; } struct vm_area_struct; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 68d82ad6b17c..fa83836b63d2 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -41,6 +41,8 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write); int is_aligned_hugepage_range(unsigned long addr, unsigned long len); int pmd_huge(pmd_t pmd); +void 
hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot); #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE #define is_hugepage_only_range(mm, addr, len) 0 @@ -101,6 +103,8 @@ static inline unsigned long hugetlb_total_pages(void) #define free_huge_page(p) ({ (void)(p); BUG(); }) #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) +#define hugetlb_change_protection(vma, address, end, newprot) + #ifndef HPAGE_MASK #define HPAGE_MASK PAGE_MASK /* Keep the compiler happy */ #define HPAGE_SIZE PAGE_SIZE diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 20117a4b8ab6..783098f6cf8e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -565,3 +565,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i; } + +void hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long start = address; + pte_t *ptep; + pte_t pte; + + BUG_ON(address >= end); + flush_cache_range(vma, address, end); + + spin_lock(&mm->page_table_lock); + for (; address < end; address += HPAGE_SIZE) { + ptep = huge_pte_offset(mm, address); + if (!ptep) + continue; + if (!pte_none(*ptep)) { + pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = pte_mkhuge(pte_modify(pte, newprot)); + set_huge_pte_at(mm, address, ptep, pte); + lazy_mmu_prot_update(pte); + } + } + spin_unlock(&mm->page_table_lock); + + flush_tlb_range(vma, start, end); +} + diff --git a/mm/mprotect.c b/mm/mprotect.c index 653b8571c1ed..4c14d4289b61 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * a MAP_NORESERVE private mapping to writable will now reserve. 
*/ if (newflags & VM_WRITE) { - if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { charged = nrpages; if (security_vm_enough_memory(charged)) return -ENOMEM; @@ -166,7 +166,10 @@ success: */ vma->vm_flags = newflags; vma->vm_page_prot = newprot; - change_protection(vma, start, end, newprot); + if (is_vm_hugetlb_page(vma)) + hugetlb_change_protection(vma, start, end, newprot); + else + change_protection(vma, start, end, newprot); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); return 0; @@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot) /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - if (is_vm_hugetlb_page(vma)) { - error = -EACCES; - goto out; - } - newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ -- cgit v1.2.3-71-gd317 From b45b5bd65f668a665db40d093e4e1fe563533608 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:55 -0800 Subject: [PATCH] hugepage: Strict page reservation for hugepage inodes These days, hugepages are demand-allocated at first fault time. There's a somewhat dubious (and racy) heuristic when making a new mmap() to check if there are enough available hugepages to fully satisfy that mapping. A particularly obvious case where the heuristic breaks down is where a process maps its hugepages not as a single chunk, but as a bunch of individually mmap()ed (or shmat()ed) blocks without touching and instantiating the pages in between allocations. In this case the size of each block is compared against the total number of available hugepages. It's thus easy for the process to become overcommitted, because each block mapping will succeed, although the total number of hugepages required by all blocks exceeds the number available. 
In particular, this defeats such a program which will detect a mapping failure and adjust its hugepage usage downward accordingly. The patch below addresses this problem, by strictly reserving a number of physical hugepages for hugepage inodes which have been mapped, but not instantiated. MAP_SHARED mappings are thus "safe" - they will fail on mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still trigger an OOM. (Actually SHARED mappings can technically still OOM, but only if the sysadmin explicitly reduces the hugepage pool between mapping and instantiation) This patch appears to address the problem at hand - it allows DB2 to start correctly, for instance, which previously suffered the failure described above. This patch causes no regressions on the libhugetlbfs testsuite, and makes a test (designed to catch this problem) pass which previously failed (ppc64, POWER5). Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 74 ++++++++------------------ include/linux/hugetlb.h | 8 ++- mm/hugetlb.c | 136 ++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 154 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b35195289945..1a1c2fcb7823 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -56,48 +56,10 @@ static void huge_pagevec_release(struct pagevec *pvec) pagevec_reinit(pvec); } -/* - * huge_pages_needed tries to determine the number of new huge pages that - * will be required to fully populate this VMA. This will be equal to - * the size of the VMA in huge pages minus the number of huge pages - * (covered by this VMA) that are found in the page cache.
- * - * Result is in bytes to be compatible with is_hugepage_mem_enough() - */ -static unsigned long -huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma) -{ - int i; - struct pagevec pvec; - unsigned long start = vma->vm_start; - unsigned long end = vma->vm_end; - unsigned long hugepages = (end - start) >> HPAGE_SHIFT; - pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT); - pgoff_t endpg = next + hugepages; - - pagevec_init(&pvec, 0); - while (next < endpg) { - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) - break; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - if (page->index > next) - next = page->index; - if (page->index >= endpg) - break; - next++; - hugepages--; - } - huge_pagevec_release(&pvec); - } - return hugepages << HPAGE_SHIFT; -} - static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_dentry->d_inode; - struct address_space *mapping = inode->i_mapping; - unsigned long bytes; + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; @@ -113,10 +75,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_end - vma->vm_start < HPAGE_SIZE) return -EINVAL; - bytes = huge_pages_needed(mapping, vma); - if (!is_hugepage_mem_enough(bytes)) - return -ENOMEM; - vma_len = (loff_t)(vma->vm_end - vma->vm_start); mutex_lock(&inode->i_mutex); @@ -129,6 +87,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) goto out; + if (vma->vm_flags & VM_MAYSHARE) + if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0) + goto out; + ret = 0; hugetlb_prefault_arch_hook(vma->vm_mm); if (inode->i_size < len) @@ -227,13 +189,18 @@ static void truncate_huge_page(struct page *page) put_page(page); } -static void truncate_hugepages(struct address_space *mapping, loff_t lstart) +static void 
truncate_hugepages(struct inode *inode, loff_t lstart) { + struct address_space *mapping = &inode->i_data; const pgoff_t start = lstart >> HPAGE_SHIFT; struct pagevec pvec; pgoff_t next; int i; + hugetlb_truncate_reservation(HUGETLBFS_I(inode), + lstart >> HPAGE_SHIFT); + if (!mapping->nrpages) + return; pagevec_init(&pvec, 0); next = start; while (1) { @@ -262,8 +229,7 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart) static void hugetlbfs_delete_inode(struct inode *inode) { - if (inode->i_data.nrpages) - truncate_hugepages(&inode->i_data, 0); + truncate_hugepages(inode, 0); clear_inode(inode); } @@ -296,8 +262,7 @@ static void hugetlbfs_forget_inode(struct inode *inode) inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); - if (inode->i_data.nrpages) - truncate_hugepages(&inode->i_data, 0); + truncate_hugepages(inode, 0); clear_inode(inode); destroy_inode(inode); } @@ -356,7 +321,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) if (!prio_tree_empty(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); spin_unlock(&mapping->i_mmap_lock); - truncate_hugepages(mapping, offset); + truncate_hugepages(inode, offset); return 0; } @@ -573,6 +538,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) hugetlbfs_inc_free_inodes(sbinfo); return NULL; } + p->prereserved_hpages = 0; return &p->vfs_inode; } @@ -805,9 +771,6 @@ struct file *hugetlb_zero_setup(size_t size) if (!can_do_hugetlb_shm()) return ERR_PTR(-EPERM); - if (!is_hugepage_mem_enough(size)) - return ERR_PTR(-ENOMEM); - if (!user_shm_lock(size, current->user)) return ERR_PTR(-ENOMEM); @@ -831,6 +794,11 @@ struct file *hugetlb_zero_setup(size_t size) if (!inode) goto out_file; + error = -ENOMEM; + if (hugetlb_extend_reservation(HUGETLBFS_I(inode), + size >> HPAGE_SHIFT) != 0) + goto out_inode; + d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; @@ -841,6 +809,8 @@ struct file 
*hugetlb_zero_setup(size_t size) file->f_mode = FMODE_WRITE | FMODE_READ; return file; +out_inode: + iput(inode); out_file: put_filp(file); out_dentry: diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index fa83836b63d2..cafe73eecb05 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -20,7 +20,6 @@ void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long) int hugetlb_prefault(struct address_space *, struct vm_area_struct *); int hugetlb_report_meminfo(char *); int hugetlb_report_node_meminfo(int, char *); -int is_hugepage_mem_enough(size_t); unsigned long hugetlb_total_pages(void); struct page *alloc_huge_page(struct vm_area_struct *, unsigned long); void free_huge_page(struct page *); @@ -89,7 +88,6 @@ static inline unsigned long hugetlb_total_pages(void) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) #define unmap_hugepage_range(vma, start, end) BUG() -#define is_hugepage_mem_enough(size) 0 #define hugetlb_report_meminfo(buf) 0 #define hugetlb_report_node_meminfo(n, buf) 0 #define follow_huge_pmd(mm, addr, pmd, write) NULL @@ -132,6 +130,8 @@ struct hugetlbfs_sb_info { struct hugetlbfs_inode_info { struct shared_policy policy; + /* Protected by the (global) hugetlb_lock */ + unsigned long prereserved_hpages; struct inode vfs_inode; }; @@ -148,6 +148,10 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern struct file_operations hugetlbfs_file_operations; extern struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_zero_setup(size_t); +int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, + unsigned long atleast_hpages); +void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, + unsigned long atmost_hpages); int hugetlb_get_quota(struct address_space *mapping); void hugetlb_put_quota(struct address_space *mapping); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 
d5987a87bbe5..27fad5d9bcf6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,7 +22,7 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static unsigned long nr_huge_pages, free_huge_pages; +static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; @@ -120,17 +120,136 @@ void free_huge_page(struct page *page) struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) { + struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page; + int use_reserve = 0; + unsigned long idx; spin_lock(&hugetlb_lock); - page = dequeue_huge_page(vma, addr); - if (!page) { - spin_unlock(&hugetlb_lock); - return NULL; + + if (vma->vm_flags & VM_MAYSHARE) { + + /* idx = radix tree index, i.e. offset into file in + * HPAGE_SIZE units */ + idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + + /* The hugetlbfs specific inode info stores the number + * of "guaranteed available" (huge) pages. That is, + * the first 'prereserved_hpages' pages of the inode + * are either already instantiated, or have been + * pre-reserved (by hugetlb_reserve_for_inode()). Here + * we're in the process of instantiating the page, so + * we use this to determine whether to draw from the + * pre-reserved pool or the truly free pool. 
*/ + if (idx < HUGETLBFS_I(inode)->prereserved_hpages) + use_reserve = 1; + } + + if (!use_reserve) { + if (free_huge_pages <= reserved_huge_pages) + goto fail; + } else { + BUG_ON(reserved_huge_pages == 0); + reserved_huge_pages--; } + + page = dequeue_huge_page(vma, addr); + if (!page) + goto fail; + spin_unlock(&hugetlb_lock); set_page_refcounted(page); return page; + + fail: + WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ + spin_unlock(&hugetlb_lock); + return NULL; +} + +/* hugetlb_extend_reservation() + * + * Ensure that at least 'atleast' hugepages are, and will remain, + * available to instantiate the first 'atleast' pages of the given + * inode. If the inode doesn't already have this many pages reserved + * or instantiated, set aside some hugepages in the reserved pool to + * satisfy later faults (or fail now if there aren't enough, rather + * than getting the SIGBUS later). + */ +int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, + unsigned long atleast) +{ + struct inode *inode = &info->vfs_inode; + unsigned long change_in_reserve = 0; + int ret = 0; + + spin_lock(&hugetlb_lock); + read_lock_irq(&inode->i_mapping->tree_lock); + + if (info->prereserved_hpages >= atleast) + goto out; + + /* Because we always call this on shared mappings, none of the + * pages beyond info->prereserved_hpages can have been + * instantiated, so we need to reserve all of them now. */ + change_in_reserve = atleast - info->prereserved_hpages; + + if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { + ret = -ENOMEM; + goto out; + } + + reserved_huge_pages += change_in_reserve; + info->prereserved_hpages = atleast; + + out: + read_unlock_irq(&inode->i_mapping->tree_lock); + spin_unlock(&hugetlb_lock); + + return ret; +} + +/* hugetlb_truncate_reservation() + * + * This returns pages reserved for the given inode to the general free + * hugepage pool. 
If the inode has any pages prereserved, but not + * instantiated, beyond offset (atmost << HPAGE_SIZE), then release + * them. + */ +void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, + unsigned long atmost) +{ + struct inode *inode = &info->vfs_inode; + struct address_space *mapping = inode->i_mapping; + unsigned long idx; + unsigned long change_in_reserve = 0; + struct page *page; + + spin_lock(&hugetlb_lock); + read_lock_irq(&inode->i_mapping->tree_lock); + + if (info->prereserved_hpages <= atmost) + goto out; + + /* Count pages which were reserved, but not instantiated, and + * which we can now release. */ + for (idx = atmost; idx < info->prereserved_hpages; idx++) { + page = radix_tree_lookup(&mapping->page_tree, idx); + if (!page) + /* Pages which are already instantiated can't + * be unreserved (and in fact have already + * been removed from the reserved pool) */ + change_in_reserve++; + } + + BUG_ON(reserved_huge_pages < change_in_reserve); + reserved_huge_pages -= change_in_reserve; + info->prereserved_hpages = atmost; + + out: + read_unlock_irq(&inode->i_mapping->tree_lock); + spin_unlock(&hugetlb_lock); } static int __init hugetlb_init(void) @@ -238,9 +357,11 @@ int hugetlb_report_meminfo(char *buf) return sprintf(buf, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" "Hugepagesize: %5lu kB\n", nr_huge_pages, free_huge_pages, + reserved_huge_pages, HPAGE_SIZE/1024); } @@ -253,11 +374,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf) nid, free_huge_pages_node[nid]); } -int is_hugepage_mem_enough(size_t size) -{ - return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages; -} - /* Return the number pages of memory we physically have, in PAGE_SIZE units. 
unsigned long hugetlb_total_pages(void) { -- cgit v1.2.3-71-gd317 From 27a85ef1b81300cfff06b4c8037e9914dfb09acc Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:56 -0800 Subject: [PATCH] hugepage: Make {alloc,free}_huge_page() local Originally, mm/hugetlb.c just handled the hugepage physical allocation path and its {alloc,free}_huge_page() functions were used from the arch specific hugepage code. These days those functions are only used within mm/hugetlb.c itself. Therefore, this patch makes them static and removes their prototypes from hugetlb.h. This requires a small rearrangement of code in mm/hugetlb.c to avoid a forward declaration. This patch causes no regressions on the libhugetlbfs testsuite (ppc64, POWER5). Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ---- mm/hugetlb.c | 25 +++++++++++++------------ 2 files changed, 13 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cafe73eecb05..5d84c368ffe4 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -21,8 +21,6 @@ int hugetlb_prefault(struct address_space *, struct vm_area_struct *); int hugetlb_report_meminfo(char *); int hugetlb_report_node_meminfo(int, char *); unsigned long hugetlb_total_pages(void); -struct page *alloc_huge_page(struct vm_area_struct *, unsigned long); -void free_huge_page(struct page *); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access); @@ -97,8 +95,6 @@ static inline unsigned long hugetlb_total_pages(void) #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ do { } while (0) -#define alloc_huge_page(vma, addr) ({ NULL; }) -#define free_huge_page(p) ({ (void)(p); BUG(); }) #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) #define
hugetlb_change_protection(vma, address, end, newprot) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 27fad5d9bcf6..075877b1cbc0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -88,6 +88,17 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, return page; } +static void free_huge_page(struct page *page) +{ + BUG_ON(page_count(page)); + + INIT_LIST_HEAD(&page->lru); + + spin_lock(&hugetlb_lock); + enqueue_huge_page(page); + spin_unlock(&hugetlb_lock); +} + static int alloc_fresh_huge_page(void) { static int nid = 0; @@ -107,18 +118,8 @@ static int alloc_fresh_huge_page(void) return 0; } -void free_huge_page(struct page *page) -{ - BUG_ON(page_count(page)); - - INIT_LIST_HEAD(&page->lru); - - spin_lock(&hugetlb_lock); - enqueue_huge_page(page); - spin_unlock(&hugetlb_lock); -} - -struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) +static struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr) { struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page; -- cgit v1.2.3-71-gd317 From 9da61aef0fd5b17dd4bf4baf33db12c470def774 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:57 -0800 Subject: [PATCH] hugepage: Fix hugepage logic in free_pgtables() free_pgtables() has special logic to call hugetlb_free_pgd_range() instead of the normal free_pgd_range() on hugepage VMAs. However, the test it uses to do so is incorrect: it calls is_hugepage_only_range on a hugepage sized range at the start of the vma. is_hugepage_only_range() will return true if the given range has any intersection with a hugepage address region, and in this case the given region need not be hugepage aligned. So, for example, this test can return true if called on, say, a 4k VMA immediately preceding a (nicely aligned) hugepage VMA. At present we get away with this because the powerpc version of hugetlb_free_pgd_range() is just a call to free_pgd_range(). 
On ia64 (the only other arch with a non-trivial is_hugepage_only_range()) we get away with it for a different reason; the hugepage area is not contiguous with the rest of the user address space, and VMAs are not permitted in between, so the test can't return a false positive there. Nonetheless this should be fixed. We do that in the patch below by replacing the is_hugepage_only_range() test with an explicit test of the VMA using is_vm_hugetlb_page(). This in turn changes behaviour for platforms where is_hugepage_only_range() returns false always (everything except powerpc and ia64). We address this by ensuring that hugetlb_free_pgd_range() is defined to be identical to free_pgd_range() (instead of a no-op) on everything except ia64. Even so, it will prevent some otherwise possible coalescing of calls down to free_pgd_range(). Since this only happens for hugepage VMAs, removing this small optimization seems unlikely to cause any trouble. This patch causes no regressions on the libhugetlbfs testsuite - ppc64 POWER5 (8-way), ppc64 G5 (2-way) and i386 Pentium M (UP). 
Signed-off-by: David Gibson Cc: William Lee Irwin III Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/page.h | 1 + include/asm-powerpc/pgtable.h | 5 ----- include/linux/hugetlb.h | 9 +++++---- mm/memory.c | 5 ++--- 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h index 5e6362a786b7..732cf3086741 100644 --- a/include/asm-ia64/page.h +++ b/include/asm-ia64/page.h @@ -57,6 +57,7 @@ # define HAVE_ARCH_HUGETLB_UNMAPPED_AREA # define ARCH_HAS_HUGEPAGE_ONLY_RANGE +# define ARCH_HAS_HUGETLB_FREE_PGD_RANGE #endif /* CONFIG_HUGETLB_PAGE */ #ifdef __ASSEMBLY__ diff --git a/include/asm-powerpc/pgtable.h b/include/asm-powerpc/pgtable.h index e38931379a72..185ee15963a1 100644 --- a/include/asm-powerpc/pgtable.h +++ b/include/asm-powerpc/pgtable.h @@ -468,11 +468,6 @@ extern pgd_t swapper_pg_dir[]; extern void paging_init(void); -#ifdef CONFIG_HUGETLB_PAGE -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - free_pgd_range(tlb, addr, end, floor, ceiling) -#endif - /* * This gets called at the end of handling a page fault, when * the kernel has put a new PTE into the page table for the process. 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5d84c368ffe4..e465fbf1ef5f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -43,8 +43,10 @@ void hugetlb_change_protection(struct vm_area_struct *vma, #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE #define is_hugepage_only_range(mm, addr, len) 0 -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - do { } while (0) +#endif + +#ifndef ARCH_HAS_HUGETLB_FREE_PGD_RANGE +#define hugetlb_free_pgd_range free_pgd_range #endif #ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE @@ -93,8 +95,7 @@ static inline unsigned long hugetlb_total_pages(void) #define prepare_hugepage_range(addr, len) (-EINVAL) #define pmd_huge(x) 0 #define is_hugepage_only_range(mm, addr, len) 0 -#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - do { } while (0) +#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) #define hugetlb_change_protection(vma, address, end, newprot) diff --git a/mm/memory.c b/mm/memory.c index 71bc664efed5..f6e3be9cbf5a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, anon_vma_unlink(vma); unlink_file_vma(vma); - if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { + if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next? 
next->vm_start: ceiling); } else { @@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_hugepage_only_range(vma->vm_mm, next->vm_start, - HPAGE_SIZE)) { + && !is_vm_hugetlb_page(vma)) { vma = next; next = vma->vm_next; anon_vma_unlink(vma); -- cgit v1.2.3-71-gd317 From 3915bcf38fe0b6d130b4bbde97804f29a0becf32 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:59 -0800 Subject: [PATCH] hugepage: Move hugetlb_free_pgd_range() prototype to hugetlb.h The optional hugepage callback, hugetlb_free_pgd_range() is presently implemented non-trivially only on ia64 (but I plan to add one for powerpc shortly). It has its own prototype for the function in asm-ia64/pgtable.h. However, since the function is called from generic code, it makes sense for its prototype to be in the generic hugetlb.h header file, as the prototypes of other arch callbacks already are (prepare_hugepage_range(), set_huge_pte_at(), etc.). This patch makes it so.
Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/pgtable.h | 3 --- include/linux/hugetlb.h | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h index 5890972a69bf..c0f8144f2349 100644 --- a/include/asm-ia64/pgtable.h +++ b/include/asm-ia64/pgtable.h @@ -505,9 +505,6 @@ extern struct page *zero_page_memmap_ptr; #define HUGETLB_PGDIR_SHIFT (HPAGE_SHIFT + 2*(PAGE_SHIFT-3)) #define HUGETLB_PGDIR_SIZE (__IA64_UL(1) << HUGETLB_PGDIR_SHIFT) #define HUGETLB_PGDIR_MASK (~(HUGETLB_PGDIR_SIZE-1)) -struct mmu_gather; -void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, - unsigned long end, unsigned long floor, unsigned long ceiling); #endif /* diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e465fbf1ef5f..5db25ffdb3eb 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -47,6 +47,10 @@ void hugetlb_change_protection(struct vm_area_struct *vma, #ifndef ARCH_HAS_HUGETLB_FREE_PGD_RANGE #define hugetlb_free_pgd_range free_pgd_range +#else +void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, + unsigned long end, unsigned long floor, + unsigned long ceiling); #endif #ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE -- cgit v1.2.3-71-gd317 From 42b88befd6e0dae1a5fe04c03925037fa890e1f3 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:09:01 -0800 Subject: [PATCH] hugepage: is_aligned_hugepage_range() cleanup Quite a long time back, prepare_hugepage_range() replaced is_aligned_hugepage_range() as the callback from mm/mmap.c to arch code to verify if an address range is suitable for a hugepage mapping. is_aligned_hugepage_range() stuck around, but only to implement prepare_hugepage_range() on archs which didn't implement their own. 
Most archs (everything except ia64 and powerpc) used the same implementation of is_aligned_hugepage_range(). On powerpc, which implements its own prepare_hugepage_range(), the custom version was never used. In addition, "is_aligned_hugepage_range()" was a bad name, because it suggests it returns true iff the given range is a good hugepage range, whereas in fact it returns 0-or-error (so the sense is reversed). This patch cleans up by abolishing is_aligned_hugepage_range(). Instead prepare_hugepage_range() is defined directly. Most archs use the default version, which simply checks the given region is aligned to the size of a hugepage. ia64 and powerpc define custom versions. The ia64 one simply checks that the range is in the correct address space region in addition to being suitably aligned. The powerpc version (just as previously) checks for suitable addresses, and if necessary performs low-level MMU frobbing to set up new areas for use by hugepages. No libhugetlbfs testsuite regressions on ppc64 (POWER5 LPAR). Signed-off-by: David Gibson Signed-off-by: Zhang Yanmin Cc: "David S. 
Miller" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/hugetlbpage.c | 12 ------------ arch/ia64/mm/hugetlbpage.c | 5 +++-- arch/powerpc/mm/hugetlbpage.c | 15 --------------- arch/sh/mm/hugetlbpage.c | 12 ------------ arch/sh64/mm/hugetlbpage.c | 12 ------------ arch/sparc64/mm/hugetlbpage.c | 12 ------------ include/asm-ia64/page.h | 1 + include/linux/hugetlb.h | 16 ++++++++++++---- 8 files changed, 16 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index d524127c9afc..a7d891585411 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -48,18 +48,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return (pte_t *) pmd; } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - #if 0 /* This is just for testing */ struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 2d13889d0a99..9dbc7dadd165 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -68,9 +68,10 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr) #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; } /* - * This function checks for proper alignment of input addr and len parameters. + * Don't actually need to do any preparation, but need to make sure + * the address is in the right region. 
*/ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) +int prepare_hugepage_range(unsigned long addr, unsigned long len) { if (len & ~HPAGE_MASK) return -EINVAL; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index b51bb28c054b..7370f9f33e29 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -133,21 +133,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return __pte(old); } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - if (! (within_hugepage_low_range(addr, len) - || within_hugepage_high_range(addr, len)) ) - return -EINVAL; - return 0; -} - struct slb_flush_info { struct mm_struct *mm; u16 newareas; diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 6b7a7688c98e..a3568fd51508 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -84,18 +84,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c index ed6a505b3ee2..3d89f2a6c785 100644 --- a/arch/sh64/mm/hugetlbpage.c +++ b/arch/sh64/mm/hugetlbpage.c @@ -84,18 +84,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -/* - * This function checks for proper alignment of input addr and len parameters. 
- */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c index a7a24869d045..280dc7958a13 100644 --- a/arch/sparc64/mm/hugetlbpage.c +++ b/arch/sparc64/mm/hugetlbpage.c @@ -263,18 +263,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h index 732cf3086741..3ab27333dae4 100644 --- a/include/asm-ia64/page.h +++ b/include/asm-ia64/page.h @@ -57,6 +57,7 @@ # define HAVE_ARCH_HUGETLB_UNMAPPED_AREA # define ARCH_HAS_HUGEPAGE_ONLY_RANGE +# define ARCH_HAS_PREPARE_HUGEPAGE_RANGE # define ARCH_HAS_HUGETLB_FREE_PGD_RANGE #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5db25ffdb3eb..d6f1019625af 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -36,7 +36,6 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write); -int is_aligned_hugepage_range(unsigned long addr, unsigned long len); int pmd_huge(pmd_t pmd); void hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); @@ -54,8 +53,18 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, #endif #ifndef 
ARCH_HAS_PREPARE_HUGEPAGE_RANGE -#define prepare_hugepage_range(addr, len) \ - is_aligned_hugepage_range(addr, len) +/* + * If the arch doesn't supply something else, assume that hugepage + * size aligned regions are ok without further preparation. + */ +static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) +{ + if (len & ~HPAGE_MASK) + return -EINVAL; + if (addr & ~HPAGE_MASK) + return -EINVAL; + return 0; +} #else int prepare_hugepage_range(unsigned long addr, unsigned long len); #endif @@ -95,7 +104,6 @@ static inline unsigned long hugetlb_total_pages(void) #define hugetlb_report_meminfo(buf) 0 #define hugetlb_report_node_meminfo(n, buf) 0 #define follow_huge_pmd(mm, addr, pmd, write) NULL -#define is_aligned_hugepage_range(addr, len) 0 #define prepare_hugepage_range(addr, len) (-EINVAL) #define pmd_huge(x) 0 #define is_hugepage_only_range(mm, addr, len) 0 -- cgit v1.2.3-71-gd317 From b20a35035f983f4ac7e29c4a68f30e43510007e0 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 22 Mar 2006 00:09:12 -0800 Subject: [PATCH] page migration reorg Centralize the page migration functions in anticipation of additional tinkering. Creates a new file mm/migrate.c 1. Extract buffer_migrate_page() from fs/buffer.c 2. Extract central migration code from vmscan.c 3. Extract some components from mempolicy.c 4. Export pageout() and remove_from_swap() from vmscan.c 5. Make it possible to configure NUMA systems without page migration and non-NUMA systems with page migration. I had to do some #ifdeffing in mempolicy.c that may need a cleanup.
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 62 ----- fs/xfs/linux-2.6/xfs_buf.c | 1 + include/linux/migrate.h | 36 +++ include/linux/swap.h | 34 ++- mm/Kconfig | 6 + mm/Makefile | 2 + mm/mempolicy.c | 113 ++------ mm/migrate.c | 655 +++++++++++++++++++++++++++++++++++++++++++++ mm/swap_state.c | 1 + mm/vmscan.c | 491 +-------------------------------- 10 files changed, 741 insertions(+), 660 deletions(-) create mode 100644 include/linux/migrate.h create mode 100644 mm/migrate.c (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index a9b399402007..1d3683d496f8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3050,68 +3050,6 @@ asmlinkage long sys_bdflush(int func, long data) return 0; } -/* - * Migration function for pages with buffers. This function can only be used - * if the underlying filesystem guarantees that no other references to "page" - * exist. - */ -#ifdef CONFIG_MIGRATION -int buffer_migrate_page(struct page *newpage, struct page *page) -{ - struct address_space *mapping = page->mapping; - struct buffer_head *bh, *head; - int rc; - - if (!mapping) - return -EAGAIN; - - if (!page_has_buffers(page)) - return migrate_page(newpage, page); - - head = page_buffers(page); - - rc = migrate_page_remove_references(newpage, page, 3); - if (rc) - return rc; - - bh = head; - do { - get_bh(bh); - lock_buffer(bh); - bh = bh->b_this_page; - - } while (bh != head); - - ClearPagePrivate(page); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - get_page(newpage); - - bh = head; - do { - set_bh_page(bh, newpage, bh_offset(bh)); - bh = bh->b_this_page; - - } while (bh != head); - - SetPagePrivate(newpage); - - migrate_page_copy(newpage, page); - - bh = head; - do { - unlock_buffer(bh); - put_bh(bh); - bh = bh->b_this_page; - - } while (bh != head); - - return 0; -} -EXPORT_SYMBOL(buffer_migrate_page); -#endif - /* * Buffer-head allocation */ diff 
--git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index bfb4f2917bb6..8cdfa4151659 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "xfs_linux.h" STATIC kmem_zone_t *xfs_buf_zone; diff --git a/include/linux/migrate.h b/include/linux/migrate.h new file mode 100644 index 000000000000..7d09962c3c0b --- /dev/null +++ b/include/linux/migrate.h @@ -0,0 +1,36 @@ +#ifndef _LINUX_MIGRATE_H +#define _LINUX_MIGRATE_H + +#include +#include + +#ifdef CONFIG_MIGRATION +extern int isolate_lru_page(struct page *p, struct list_head *pagelist); +extern int putback_lru_pages(struct list_head *l); +extern int migrate_page(struct page *, struct page *); +extern void migrate_page_copy(struct page *, struct page *); +extern int migrate_page_remove_references(struct page *, struct page *, int); +extern int migrate_pages(struct list_head *l, struct list_head *t, + struct list_head *moved, struct list_head *failed); +int migrate_pages_to(struct list_head *pagelist, + struct vm_area_struct *vma, int dest); +extern int fail_migrate_page(struct page *, struct page *); + +extern int migrate_prep(void); + +#else + +static inline int isolate_lru_page(struct page *p, struct list_head *list) + { return -ENOSYS; } +static inline int putback_lru_pages(struct list_head *l) { return 0; } +static inline int migrate_pages(struct list_head *l, struct list_head *t, + struct list_head *moved, struct list_head *failed) { return -ENOSYS; } + +static inline int migrate_prep(void) { return -ENOSYS; } + +/* Possible settings for the migrate_page() method in address_operations */ +#define migrate_page NULL +#define fail_migrate_page NULL + +#endif /* CONFIG_MIGRATION */ +#endif /* _LINUX_MIGRATE_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 3dc6c89c49b8..12415dd94451 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -175,6 +175,21 @@ extern void swap_setup(void); extern 
unsigned long try_to_free_pages(struct zone **, gfp_t); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; +extern int remove_mapping(struct address_space *mapping, struct page *page); + +/* possible outcome of pageout() */ +typedef enum { + /* failed to write page out, page is locked */ + PAGE_KEEP, + /* move page to the active list, page is locked */ + PAGE_ACTIVATE, + /* page has been sent to the disk successfully, page is unlocked */ + PAGE_SUCCESS, + /* page is clean and locked */ + PAGE_CLEAN, +} pageout_t; + +extern pageout_t pageout(struct page *page, struct address_space *mapping); #ifdef CONFIG_NUMA extern int zone_reclaim_mode; @@ -188,25 +203,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) } #endif -#ifdef CONFIG_MIGRATION -extern int isolate_lru_page(struct page *p); -extern unsigned long putback_lru_pages(struct list_head *l); -extern int migrate_page(struct page *, struct page *); -extern void migrate_page_copy(struct page *, struct page *); -extern int migrate_page_remove_references(struct page *, struct page *, int); -extern unsigned long migrate_pages(struct list_head *l, struct list_head *t, - struct list_head *moved, struct list_head *failed); -extern int fail_migrate_page(struct page *, struct page *); -#else -static inline int isolate_lru_page(struct page *p) { return -ENOSYS; } -static inline int putback_lru_pages(struct list_head *l) { return 0; } -static inline int migrate_pages(struct list_head *l, struct list_head *t, - struct list_head *moved, struct list_head *failed) { return -ENOSYS; } -/* Possible settings for the migrate_page() method in address_operations */ -#define migrate_page NULL -#define fail_migrate_page NULL -#endif - #ifdef CONFIG_MMU /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); diff --git a/mm/Kconfig b/mm/Kconfig index a9cb80ae6409..bd80460360db 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -137,5 +137,11 @@ 
config SPLIT_PTLOCK_CPUS # support for page migration # config MIGRATION + bool "Page migration" def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM depends on SWAP + help + Allows the migration of the physical location of pages of processes + while the virtual addresses are not changed. This is useful for + example on NUMA systems to put pages nearer to the processors accessing + the page. diff --git a/mm/Makefile b/mm/Makefile index 9aa03fa1dcc3..f10c753dce6d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o +obj-$(CONFIG_MIGRATION) += migrate.o + diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 96195dcb62e1..e93cc740c22b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include @@ -95,9 +96,6 @@ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ -/* The number of pages to migrate per call to migrate_pages() */ -#define MIGRATE_CHUNK_SIZE 256 - static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; @@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct vm_area_struct *first, *vma, *prev; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - /* Must have swap device for migration */ - if (nr_swap_pages <= 0) - return ERR_PTR(-ENODEV); - /* - * Clear the LRU lists so pages can be isolated. - * Note that pages may be moved off the LRU after we have - * drained them. Those pages will fail to migrate like other - * pages that may be busy. 
- */ - lru_add_drain_all(); + err = migrate_prep(); + if (err) + return ERR_PTR(err); } first = find_vma(mm, start); @@ -550,92 +541,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } +#ifdef CONFIG_MIGRATION /* * page migration */ - static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags) { /* * Avoid migrating a page that is shared with others. */ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { - if (isolate_lru_page(page)) - list_add_tail(&page->lru, pagelist); - } -} - -/* - * Migrate the list 'pagelist' of pages to a certain destination. - * - * Specify destination with either non-NULL vma or dest_node >= 0 - * Return the number of pages not migrated or error code - */ -static int migrate_pages_to(struct list_head *pagelist, - struct vm_area_struct *vma, int dest) -{ - LIST_HEAD(newlist); - LIST_HEAD(moved); - LIST_HEAD(failed); - int err = 0; - unsigned long offset = 0; - int nr_pages; - struct page *page; - struct list_head *p; - -redo: - nr_pages = 0; - list_for_each(p, pagelist) { - if (vma) { - /* - * The address passed to alloc_page_vma is used to - * generate the proper interleave behavior. We fake - * the address here by an increasing offset in order - * to get the proper distribution of pages. - * - * No decision has been made as to which page - * a certain old page is moved to so we cannot - * specify the correct address. - */ - page = alloc_page_vma(GFP_HIGHUSER, vma, - offset + vma->vm_start); - offset += PAGE_SIZE; - } - else - page = alloc_pages_node(dest, GFP_HIGHUSER, 0); - - if (!page) { - err = -ENOMEM; - goto out; - } - list_add_tail(&page->lru, &newlist); - nr_pages++; - if (nr_pages > MIGRATE_CHUNK_SIZE) - break; - } - err = migrate_pages(pagelist, &newlist, &moved, &failed); - - putback_lru_pages(&moved); /* Call release pages instead ?? 
*/ - - if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) - goto redo; -out: - /* Return leftover allocated pages */ - while (!list_empty(&newlist)) { - page = list_entry(newlist.next, struct page, lru); - list_del(&page->lru); - __free_page(page); - } - list_splice(&failed, pagelist); - if (err < 0) - return err; - - /* Calculate number of leftover pages */ - nr_pages = 0; - list_for_each(p, pagelist) - nr_pages++; - return nr_pages; + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) + isolate_lru_page(page, pagelist); } /* @@ -742,8 +659,23 @@ int do_migrate_pages(struct mm_struct *mm, if (err < 0) return err; return busy; + } +#else + +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ +} + +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + return -ENOSYS; +} +#endif + long do_mbind(unsigned long start, unsigned long len, unsigned long mode, nodemask_t *nmask, unsigned long flags) { @@ -808,6 +740,7 @@ long do_mbind(unsigned long start, unsigned long len, if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } + if (!list_empty(&pagelist)) putback_lru_pages(&pagelist); diff --git a/mm/migrate.c b/mm/migrate.c new file mode 100644 index 000000000000..09f6e4aa87fc --- /dev/null +++ b/mm/migrate.c @@ -0,0 +1,655 @@ +/* + * Memory Migration functionality - linux/mm/migration.c + * + * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter + * + * Page migration was first developed in the context of the memory hotplug + * project. 
The main authors of the migration code are: + * + * IWAMOTO Toshihiro + * Hirokazu Takahashi + * Dave Hansen + * Christoph Lameter + */ + +#include +#include +#include +#include +#include /* for try_to_release_page(), + buffer_heads_over_limit */ +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#include "internal.h" + +/* The maximum number of pages to take off the LRU for migration */ +#define MIGRATE_CHUNK_SIZE 256 + +#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) + +/* + * Isolate one page from the LRU lists. If successful put it onto + * the indicated list with elevated page count. + * + * Result: + * -EBUSY: page not on LRU list + * 0: page removed from LRU list and added to the specified list. + */ +int isolate_lru_page(struct page *page, struct list_head *pagelist) +{ + int ret = -EBUSY; + + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + ret = 0; + get_page(page); + ClearPageLRU(page); + if (PageActive(page)) + del_page_from_active_list(zone, page); + else + del_page_from_inactive_list(zone, page); + list_add_tail(&page->lru, pagelist); + } + spin_unlock_irq(&zone->lru_lock); + } + return ret; +} + +/* + * migrate_prep() needs to be called after we have compiled the list of pages + * to be migrated using isolate_lru_page() but before we begin a series of calls + * to migrate_pages(). + */ +int migrate_prep(void) +{ + /* Must have swap device for migration */ + if (nr_swap_pages <= 0) + return -ENODEV; + + /* + * Clear the LRU lists so pages can be isolated. + * Note that pages may be moved off the LRU after we have + * drained them. Those pages will fail to migrate like other + * pages that may be busy. 
+ */ + lru_add_drain_all(); + + return 0; +} + +static inline void move_to_lru(struct page *page) +{ + list_del(&page->lru); + if (PageActive(page)) { + /* + * lru_cache_add_active checks that + * the PG_active bit is off. + */ + ClearPageActive(page); + lru_cache_add_active(page); + } else { + lru_cache_add(page); + } + put_page(page); +} + +/* + * Add isolated pages on the list back to the LRU. + * + * returns the number of pages put back. + */ +int putback_lru_pages(struct list_head *l) +{ + struct page *page; + struct page *page2; + int count = 0; + + list_for_each_entry_safe(page, page2, l, lru) { + move_to_lru(page); + count++; + } + return count; +} + +/* + * Non migratable page + */ +int fail_migrate_page(struct page *newpage, struct page *page) +{ + return -EIO; +} +EXPORT_SYMBOL(fail_migrate_page); + +/* + * swapout a single page + * page is locked upon entry, unlocked on exit + */ +static int swap_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (page_mapped(page) && mapping) + if (try_to_unmap(page, 1) != SWAP_SUCCESS) + goto unlock_retry; + + if (PageDirty(page)) { + /* Page is dirty, try to write it out here */ + switch(pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_retry; + + case PAGE_SUCCESS: + goto retry; + + case PAGE_CLEAN: + ; /* try to free the page below */ + } + } + + if (PagePrivate(page)) { + if (!try_to_release_page(page, GFP_KERNEL) || + (!mapping && page_count(page) == 1)) + goto unlock_retry; + } + + if (remove_mapping(mapping, page)) { + /* Success */ + unlock_page(page); + return 0; + } + +unlock_retry: + unlock_page(page); + +retry: + return -EAGAIN; +} +EXPORT_SYMBOL(swap_page); + +/* + * Remove references for a page and establish the new page with the correct + * basic settings to be able to stop accesses to the page. 
+ */ +int migrate_page_remove_references(struct page *newpage, + struct page *page, int nr_refs) +{ + struct address_space *mapping = page_mapping(page); + struct page **radix_pointer; + + /* + * Avoid doing any of the following work if the page count + * indicates that the page is in use or truncate has removed + * the page. + */ + if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) + return -EAGAIN; + + /* + * Establish swap ptes for anonymous pages or destroy pte + * maps for files. + * + * In order to reestablish file backed mappings the fault handlers + * will take the radix tree_lock which may then be used to stop + * processes from accessing this page until the new page is ready. + * + * A process accessing via a swap pte (an anonymous page) will take a + * page_lock on the old page which will block the process until the + * migration attempt is complete. At that time the PageSwapCache bit + * will be examined. If the page was migrated then the PageSwapCache + * bit will be clear and the operation to retrieve the page will be + * retried which will find the new page in the radix tree. Then a new + * direct mapping may be generated based on the radix tree contents. + * + * If the page was not migrated then the PageSwapCache bit + * is still set and the operation may continue. + */ + if (try_to_unmap(page, 1) == SWAP_FAIL) + /* A vma has VM_LOCKED set -> permanent failure */ + return -EPERM; + + /* + * Give up if we were unable to remove all mappings. + */ + if (page_mapcount(page)) + return -EAGAIN; + + write_lock_irq(&mapping->tree_lock); + + radix_pointer = (struct page **)radix_tree_lookup_slot( + &mapping->page_tree, + page_index(page)); + + if (!page_mapping(page) || page_count(page) != nr_refs || + *radix_pointer != page) { + write_unlock_irq(&mapping->tree_lock); + return 1; + } + + /* + * Now we know that no one else is looking at the page.
+ * + * Certain minimal information about a page must be available + * in order for other subsystems to properly handle the page if they + * find it through the radix tree update before we are finished + * copying the page. + */ + get_page(newpage); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapCache(page)) { + SetPageSwapCache(newpage); + set_page_private(newpage, page_private(page)); + } + + *radix_pointer = newpage; + __put_page(page); + write_unlock_irq(&mapping->tree_lock); + + return 0; +} +EXPORT_SYMBOL(migrate_page_remove_references); + +/* + * Copy the page to its new location + */ +void migrate_page_copy(struct page *newpage, struct page *page) +{ + copy_highpage(newpage, page); + + if (PageError(page)) + SetPageError(newpage); + if (PageReferenced(page)) + SetPageReferenced(newpage); + if (PageUptodate(page)) + SetPageUptodate(newpage); + if (PageActive(page)) + SetPageActive(newpage); + if (PageChecked(page)) + SetPageChecked(newpage); + if (PageMappedToDisk(page)) + SetPageMappedToDisk(newpage); + + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + set_page_dirty(newpage); + } + + ClearPageSwapCache(page); + ClearPageActive(page); + ClearPagePrivate(page); + set_page_private(page, 0); + page->mapping = NULL; + + /* + * If any waiters have accumulated on the new page then + * wake them up. + */ + if (PageWriteback(newpage)) + end_page_writeback(newpage); +} +EXPORT_SYMBOL(migrate_page_copy); + +/* + * Common logic to directly migrate a single page suitable for + * pages that do not use PagePrivate. + * + * Pages are locked upon entry and exit. + */ +int migrate_page(struct page *newpage, struct page *page) +{ + int rc; + + BUG_ON(PageWriteback(page)); /* Writeback must be complete */ + + rc = migrate_page_remove_references(newpage, page, 2); + + if (rc) + return rc; + + migrate_page_copy(newpage, page); + + /* + * Remove auxiliary swap entries and replace + * them with real ptes. 
+ * + * Note that a real pte entry will allow processes that are not + * waiting on the page lock to use the new page via the page tables + * before the new page is unlocked. + */ + remove_from_swap(newpage); + return 0; +} +EXPORT_SYMBOL(migrate_page); + +/* + * migrate_pages + * + * Two lists are passed to this function. The first list + * contains the pages isolated from the LRU to be migrated. + * The second list contains new pages that the pages isolated + * can be moved to. If the second list is NULL then all + * pages are swapped out. + * + * The function returns after 10 attempts or if no pages + * are movable anymore because "to" has become empty + * or no retryable pages exist anymore. + * + * Return: Number of pages not migrated when "to" ran empty. + */ +int migrate_pages(struct list_head *from, struct list_head *to, + struct list_head *moved, struct list_head *failed) +{ + int retry; + int nr_failed = 0; + int pass = 0; + struct page *page; + struct page *page2; + int swapwrite = current->flags & PF_SWAPWRITE; + int rc; + + if (!swapwrite) + current->flags |= PF_SWAPWRITE; + +redo: + retry = 0; + + list_for_each_entry_safe(page, page2, from, lru) { + struct page *newpage = NULL; + struct address_space *mapping; + + cond_resched(); + + rc = 0; + if (page_count(page) == 1) + /* page was freed from under us. So we are done. */ + goto next; + + if (to && list_empty(to)) + break; + + /* + * Skip locked pages during the first two passes to give the + * functions holding the lock time to release the page. Later we + * use lock_page() to have a higher chance of acquiring the + * lock. + */ + rc = -EAGAIN; + if (pass > 2) + lock_page(page); + else + if (TestSetPageLocked(page)) + goto next; + + /* + * Only wait on writeback if we have already done a pass where + * we may have triggered writeouts for lots of pages.
+ */ + if (pass > 0) { + wait_on_page_writeback(page); + } else { + if (PageWriteback(page)) + goto unlock_page; + } + + /* + * Anonymous pages must have swap cache references otherwise + * the information contained in the page maps cannot be + * preserved. + */ + if (PageAnon(page) && !PageSwapCache(page)) { + if (!add_to_swap(page, GFP_KERNEL)) { + rc = -ENOMEM; + goto unlock_page; + } + } + + if (!to) { + rc = swap_page(page); + goto next; + } + + newpage = lru_to_page(to); + lock_page(newpage); + + /* + * Pages are properly locked and writeback is complete. + * Try to migrate the page. + */ + mapping = page_mapping(page); + if (!mapping) + goto unlock_both; + + if (mapping->a_ops->migratepage) { + /* + * Most pages have a mapping and most filesystems + * should provide a migration function. Anonymous + * pages are part of swap space which also has its + * own migration function. This is the most common + * path for page migration. + */ + rc = mapping->a_ops->migratepage(newpage, page); + goto unlock_both; + } + + /* + * Default handling if a filesystem does not provide + * a migration function. We can only migrate clean + * pages so try to write out any dirty pages first. + */ + if (PageDirty(page)) { + switch (pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_both; + + case PAGE_SUCCESS: + unlock_page(newpage); + goto next; + + case PAGE_CLEAN: + ; /* try to migrate the page below */ + } + } + + /* + * Buffers are managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (!page_has_buffers(page) || + try_to_release_page(page, GFP_KERNEL)) { + rc = migrate_page(newpage, page); + goto unlock_both; + } + + /* + * On early passes with mapped pages simply + * retry. There may be a lock held for some + * buffers that may go away. Later + * swap them out. + */ + if (pass > 4) { + /* + * Persistently unable to drop buffers..... As a + * measure of last resort we fall back to + * swap_page(). 
+ */ + unlock_page(newpage); + newpage = NULL; + rc = swap_page(page); + goto next; + } + +unlock_both: + unlock_page(newpage); + +unlock_page: + unlock_page(page); + +next: + if (rc == -EAGAIN) { + retry++; + } else if (rc) { + /* Permanent failure */ + list_move(&page->lru, failed); + nr_failed++; + } else { + if (newpage) { + /* Successful migration. Return page to LRU */ + move_to_lru(newpage); + } + list_move(&page->lru, moved); + } + } + if (retry && pass++ < 10) + goto redo; + + if (!swapwrite) + current->flags &= ~PF_SWAPWRITE; + + return nr_failed + retry; +} + +/* + * Migration function for pages with buffers. This function can only be used + * if the underlying filesystem guarantees that no other references to "page" + * exist. + */ +int buffer_migrate_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct buffer_head *bh, *head; + int rc; + + if (!mapping) + return -EAGAIN; + + if (!page_has_buffers(page)) + return migrate_page(newpage, page); + + head = page_buffers(page); + + rc = migrate_page_remove_references(newpage, page, 3); + + if (rc) + return rc; + + bh = head; + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + ClearPagePrivate(page); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + get_page(newpage); + + bh = head; + do { + set_bh_page(bh, newpage, bh_offset(bh)); + bh = bh->b_this_page; + + } while (bh != head); + + SetPagePrivate(newpage); + + migrate_page_copy(newpage, page); + + bh = head; + do { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return 0; +} +EXPORT_SYMBOL(buffer_migrate_page); + +/* + * Migrate the list 'pagelist' of pages to a certain destination. 
+ * + * Specify destination with either non-NULL vma or dest_node >= 0 + * Return the number of pages not migrated or error code + */ +int migrate_pages_to(struct list_head *pagelist, + struct vm_area_struct *vma, int dest) +{ + LIST_HEAD(newlist); + LIST_HEAD(moved); + LIST_HEAD(failed); + int err = 0; + unsigned long offset = 0; + int nr_pages; + struct page *page; + struct list_head *p; + +redo: + nr_pages = 0; + list_for_each(p, pagelist) { + if (vma) { + /* + * The address passed to alloc_page_vma is used to + * generate the proper interleave behavior. We fake + * the address here by an increasing offset in order + * to get the proper distribution of pages. + * + * No decision has been made as to which page + * a certain old page is moved to so we cannot + * specify the correct address. + */ + page = alloc_page_vma(GFP_HIGHUSER, vma, + offset + vma->vm_start); + offset += PAGE_SIZE; + } + else + page = alloc_pages_node(dest, GFP_HIGHUSER, 0); + + if (!page) { + err = -ENOMEM; + goto out; + } + list_add_tail(&page->lru, &newlist); + nr_pages++; + if (nr_pages > MIGRATE_CHUNK_SIZE) + break; + } + err = migrate_pages(pagelist, &newlist, &moved, &failed); + + putback_lru_pages(&moved); /* Call release pages instead ?? 
*/ + + if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) + goto redo; +out: + /* Return leftover allocated pages */ + while (!list_empty(&newlist)) { + page = list_entry(newlist.next, struct page, lru); + list_del(&page->lru); + __free_page(page); + } + list_splice(&failed, pagelist); + if (err < 0) + return err; + + /* Calculate number of leftover pages */ + nr_pages = 0; + list_for_each(p, pagelist) + nr_pages++; + return nr_pages; +} diff --git a/mm/swap_state.c b/mm/swap_state.c index db8a3d3e1636..d7af296833fc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -15,6 +15,7 @@ #include #include #include +#include #include diff --git a/mm/vmscan.c b/mm/vmscan.c index 548e023c193b..fd572bbdc9f5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -42,18 +42,6 @@ #include "internal.h" -/* possible outcome of pageout() */ -typedef enum { - /* failed to write page out, page is locked */ - PAGE_KEEP, - /* move page to the active list, page is locked */ - PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ - PAGE_SUCCESS, - /* page is clean and locked */ - PAGE_CLEAN, -} pageout_t; - struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -304,7 +292,7 @@ static void handle_write_error(struct address_space *mapping, * pageout is called by shrink_page_list() for each dirty page. * Calls ->writepage(). 
*/ -static pageout_t pageout(struct page *page, struct address_space *mapping) +pageout_t pageout(struct page *page, struct address_space *mapping) { /* * If the page is dirty, only perform writeback if that write @@ -372,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) return PAGE_CLEAN; } -static int remove_mapping(struct address_space *mapping, struct page *page) +int remove_mapping(struct address_space *mapping, struct page *page) { if (!mapping) return 0; /* truncate got there first */ @@ -570,481 +558,6 @@ keep: return nr_reclaimed; } -#ifdef CONFIG_MIGRATION -static inline void move_to_lru(struct page *page) -{ - list_del(&page->lru); - if (PageActive(page)) { - /* - * lru_cache_add_active checks that - * the PG_active bit is off. - */ - ClearPageActive(page); - lru_cache_add_active(page); - } else { - lru_cache_add(page); - } - put_page(page); -} - -/* - * Add isolated pages on the list back to the LRU. - * - * returns the number of pages put back. 
- */ -unsigned long putback_lru_pages(struct list_head *l) -{ - struct page *page; - struct page *page2; - unsigned long count = 0; - - list_for_each_entry_safe(page, page2, l, lru) { - move_to_lru(page); - count++; - } - return count; -} - -/* - * Non migratable page - */ -int fail_migrate_page(struct page *newpage, struct page *page) -{ - return -EIO; -} -EXPORT_SYMBOL(fail_migrate_page); - -/* - * swapout a single page - * page is locked upon entry, unlocked on exit - */ -static int swap_page(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - - if (page_mapped(page) && mapping) - if (try_to_unmap(page, 1) != SWAP_SUCCESS) - goto unlock_retry; - - if (PageDirty(page)) { - /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_retry; - - case PAGE_SUCCESS: - goto retry; - - case PAGE_CLEAN: - ; /* try to free the page below */ - } - } - - if (PagePrivate(page)) { - if (!try_to_release_page(page, GFP_KERNEL) || - (!mapping && page_count(page) == 1)) - goto unlock_retry; - } - - if (remove_mapping(mapping, page)) { - /* Success */ - unlock_page(page); - return 0; - } - -unlock_retry: - unlock_page(page); - -retry: - return -EAGAIN; -} -EXPORT_SYMBOL(swap_page); - -/* - * Page migration was first developed in the context of the memory hotplug - * project. The main authors of the migration code are: - * - * IWAMOTO Toshihiro - * Hirokazu Takahashi - * Dave Hansen - * Christoph Lameter - */ - -/* - * Remove references for a page and establish the new page with the correct - * basic settings to be able to stop accesses to the page. - */ -int migrate_page_remove_references(struct page *newpage, - struct page *page, int nr_refs) -{ - struct address_space *mapping = page_mapping(page); - struct page **radix_pointer; - - /* - * Avoid doing any of the following work if the page count - * indicates that the page is in use or truncate has removed - * the page. 
- */ - if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) - return -EAGAIN; - - /* - * Establish swap ptes for anonymous pages or destroy pte - * maps for files. - * - * In order to reestablish file backed mappings the fault handlers - * will take the radix tree_lock which may then be used to stop - * processses from accessing this page until the new page is ready. - * - * A process accessing via a swap pte (an anonymous page) will take a - * page_lock on the old page which will block the process until the - * migration attempt is complete. At that time the PageSwapCache bit - * will be examined. If the page was migrated then the PageSwapCache - * bit will be clear and the operation to retrieve the page will be - * retried which will find the new page in the radix tree. Then a new - * direct mapping may be generated based on the radix tree contents. - * - * If the page was not migrated then the PageSwapCache bit - * is still set and the operation may continue. - */ - if (try_to_unmap(page, 1) == SWAP_FAIL) - /* A vma has VM_LOCKED set -> Permanent failure */ - return -EPERM; - - /* - * Give up if we were unable to remove all mappings. - */ - if (page_mapcount(page)) - return -EAGAIN; - - write_lock_irq(&mapping->tree_lock); - - radix_pointer = (struct page **)radix_tree_lookup_slot( - &mapping->page_tree, - page_index(page)); - - if (!page_mapping(page) || page_count(page) != nr_refs || - *radix_pointer != page) { - write_unlock_irq(&mapping->tree_lock); - return -EAGAIN; - } - - /* - * Now we know that no one else is looking at the page. - * - * Certain minimal information about a page must be available - * in order for other subsystems to properly handle the page if they - * find it through the radix tree update before we are finished - * copying the page. 
- */ - get_page(newpage); - newpage->index = page->index; - newpage->mapping = page->mapping; - if (PageSwapCache(page)) { - SetPageSwapCache(newpage); - set_page_private(newpage, page_private(page)); - } - - *radix_pointer = newpage; - __put_page(page); - write_unlock_irq(&mapping->tree_lock); - - return 0; -} -EXPORT_SYMBOL(migrate_page_remove_references); - -/* - * Copy the page to its new location - */ -void migrate_page_copy(struct page *newpage, struct page *page) -{ - copy_highpage(newpage, page); - - if (PageError(page)) - SetPageError(newpage); - if (PageReferenced(page)) - SetPageReferenced(newpage); - if (PageUptodate(page)) - SetPageUptodate(newpage); - if (PageActive(page)) - SetPageActive(newpage); - if (PageChecked(page)) - SetPageChecked(newpage); - if (PageMappedToDisk(page)) - SetPageMappedToDisk(newpage); - - if (PageDirty(page)) { - clear_page_dirty_for_io(page); - set_page_dirty(newpage); - } - - ClearPageSwapCache(page); - ClearPageActive(page); - ClearPagePrivate(page); - set_page_private(page, 0); - page->mapping = NULL; - - /* - * If any waiters have accumulated on the new page then - * wake them up. - */ - if (PageWriteback(newpage)) - end_page_writeback(newpage); -} -EXPORT_SYMBOL(migrate_page_copy); - -/* - * Common logic to directly migrate a single page suitable for - * pages that do not use PagePrivate. - * - * Pages are locked upon entry and exit. - */ -int migrate_page(struct page *newpage, struct page *page) -{ - int rc; - - BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - - rc = migrate_page_remove_references(newpage, page, 2); - - if (rc) - return rc; - - migrate_page_copy(newpage, page); - - /* - * Remove auxiliary swap entries and replace - * them with real ptes. - * - * Note that a real pte entry will allow processes that are not - * waiting on the page lock to use the new page via the page tables - * before the new page is unlocked. 
- */ - remove_from_swap(newpage); - return 0; -} -EXPORT_SYMBOL(migrate_page); - -/* - * migrate_pages - * - * Two lists are passed to this function. The first list - * contains the pages isolated from the LRU to be migrated. - * The second list contains new pages that the pages isolated - * can be moved to. If the second list is NULL then all - * pages are swapped out. - * - * The function returns after 10 attempts or if no pages - * are movable anymore because to has become empty - * or no retryable pages exist anymore. - * - * Return: Number of pages not migrated when "to" ran empty. - */ -unsigned long migrate_pages(struct list_head *from, struct list_head *to, - struct list_head *moved, struct list_head *failed) -{ - unsigned long retry; - unsigned long nr_failed = 0; - int pass = 0; - struct page *page; - struct page *page2; - int swapwrite = current->flags & PF_SWAPWRITE; - int rc; - - if (!swapwrite) - current->flags |= PF_SWAPWRITE; - -redo: - retry = 0; - - list_for_each_entry_safe(page, page2, from, lru) { - struct page *newpage = NULL; - struct address_space *mapping; - - cond_resched(); - - rc = 0; - if (page_count(page) == 1) - /* page was freed from under us. So we are done. */ - goto next; - - if (to && list_empty(to)) - break; - - /* - * Skip locked pages during the first two passes to give the - * functions holding the lock time to release the page. Later we - * use lock_page() to have a higher chance of acquiring the - * lock. - */ - rc = -EAGAIN; - if (pass > 2) - lock_page(page); - else - if (TestSetPageLocked(page)) - goto next; - - /* - * Only wait on writeback if we have already done a pass where - * we we may have triggered writeouts for lots of pages. - */ - if (pass > 0) { - wait_on_page_writeback(page); - } else { - if (PageWriteback(page)) - goto unlock_page; - } - - /* - * Anonymous pages must have swap cache references otherwise - * the information contained in the page maps cannot be - * preserved. 
- */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page, GFP_KERNEL)) { - rc = -ENOMEM; - goto unlock_page; - } - } - - if (!to) { - rc = swap_page(page); - goto next; - } - - newpage = lru_to_page(to); - lock_page(newpage); - - /* - * Pages are properly locked and writeback is complete. - * Try to migrate the page. - */ - mapping = page_mapping(page); - if (!mapping) - goto unlock_both; - - if (mapping->a_ops->migratepage) { - /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. - */ - rc = mapping->a_ops->migratepage(newpage, page); - goto unlock_both; - } - - /* - * Default handling if a filesystem does not provide - * a migration function. We can only migrate clean - * pages so try to write out any dirty pages first. - */ - if (PageDirty(page)) { - switch (pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_both; - - case PAGE_SUCCESS: - unlock_page(newpage); - goto next; - - case PAGE_CLEAN: - ; /* try to migrate the page below */ - } - } - - /* - * Buffers are managed in a filesystem specific way. - * We must have no buffers or drop them. - */ - if (!page_has_buffers(page) || - try_to_release_page(page, GFP_KERNEL)) { - rc = migrate_page(newpage, page); - goto unlock_both; - } - - /* - * On early passes with mapped pages simply - * retry. There may be a lock held for some - * buffers that may go away. Later - * swap them out. - */ - if (pass > 4) { - /* - * Persistently unable to drop buffers..... As a - * measure of last resort we fall back to - * swap_page(). 
- */ - unlock_page(newpage); - newpage = NULL; - rc = swap_page(page); - goto next; - } - -unlock_both: - unlock_page(newpage); - -unlock_page: - unlock_page(page); - -next: - if (rc == -EAGAIN) { - retry++; - } else if (rc) { - /* Permanent failure */ - list_move(&page->lru, failed); - nr_failed++; - } else { - if (newpage) { - /* Successful migration. Return page to LRU */ - move_to_lru(newpage); - } - list_move(&page->lru, moved); - } - } - if (retry && pass++ < 10) - goto redo; - - if (!swapwrite) - current->flags &= ~PF_SWAPWRITE; - - return nr_failed + retry; -} - -/* - * Isolate one page from the LRU lists and put it on the - * indicated list with elevated refcount. - * - * Result: - * 0 = page not on LRU list - * 1 = page removed from LRU list and added to the specified list. - */ -int isolate_lru_page(struct page *page) -{ - int ret = 0; - - if (PageLRU(page)) { - struct zone *zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page)) { - ret = 1; - get_page(page); - ClearPageLRU(page); - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - } - spin_unlock_irq(&zone->lru_lock); - } - - return ret; -} -#endif - /* * zone->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages -- cgit v1.2.3-71-gd317