diff -r 322370200e6a innobase/include/os0file.h
--- a/innobase/include/os0file.h	Mon Nov 03 05:07:57 2008 -0800
+++ b/innobase/include/os0file.h	Mon Nov 03 05:08:52 2008 -0800
@@ -532,21 +532,16 @@
 					FALSE otherwise */
 	const char*	path);	/* in: path name */
 /****************************************************************************
-Initializes the asynchronous io system. Creates separate aio array for
-non-ibuf read and write, a third aio array for the ibuf i/o, with just one
-segment, two aio arrays for log reads and writes with one segment, and a
-synchronous aio array of the specified size. The combined number of segments
-in the three first aio arrays is the parameter n_segments given to the
-function. The caller must create an i/o handler thread for each segment in
-the four first arrays, but not for the sync aio array. */
+Initializes the asynchronous io system. */

-void
+ulint
 os_aio_init(
 /*========*/
-	ulint	n,		/* in: maximum number of pending aio operations
-				allowed; n must be divisible by n_segments */
-	ulint	n_segments,	/* in: combined number of segments in the four
-				first aio arrays; must be >= 4 */
+				/* out: number of AIO handler threads */
+	ulint	ios_per_array,	/* in: maximum number of pending aio operations
+				allowed per IO array */
+	ulint	n_read_threads,	/* in: number of read threads */
+	ulint	n_write_threads,/* in: number of write threads */
 	ulint	n_slots_sync);	/* in: number of slots in the sync aio array */
 /***********************************************************************
 Requests an asynchronous i/o operation. */
diff -r 322370200e6a innobase/include/srv0srv.h
--- a/innobase/include/srv0srv.h	Mon Nov 03 05:07:57 2008 -0800
+++ b/innobase/include/srv0srv.h	Mon Nov 03 05:08:52 2008 -0800
@@ -87,6 +87,14 @@
 extern ulint	srv_lock_table_size;

 extern ulint	srv_n_file_io_threads;
+extern ulint	srv_n_read_io_threads;
+extern ulint	srv_n_write_io_threads;
+
+/* Number of IO operations per second the server can do */
+extern ulint	srv_io_capacity;
+
+/* Flush dirty pages when below max dirty percent */
+extern ibool	srv_extra_dirty_writes;

 #ifdef UNIV_LOG_ARCHIVE
 extern ibool	srv_log_archive_on;
@@ -252,6 +260,24 @@
 /* variable to count the number of random read-aheads were done */
 extern ulint	srv_read_ahead_rnd;

+/* Number of IO operations read/write done for all threads */
+extern ulint	os_aio_read_requests;
+extern ulint	os_aio_write_requests;
+
+/* Number of pages read/written done for all threads */
+extern ulint	os_aio_pages_read;
+extern ulint	os_aio_pages_written;
+
+/* time usec used to perform read/write for all threads */
+extern ib_longlong	os_aio_read_time;
+extern ib_longlong	os_aio_write_time;
+
+extern ulint	inno_pending_normal_aio_reads;
+extern ulint	inno_pending_normal_aio_writes;
+extern ulint	inno_pending_ibuf_aio_reads;
+extern ulint	inno_pending_log_ios;
+extern ulint	inno_pending_sync_ios;

 /* In this structure we store status variables to be passed to MySQL */
 typedef struct export_var_struct export_struc;
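The header change above inverts the old contract: instead of the caller passing a segment count, os_aio_init now derives the segment layout from the thread counts and returns the number of handler threads to start. A minimal caller sketch (illustrative only; it mirrors the srv0start.c hunk later in this patch and assumes the same SRV_* constants and the existing n/thread_ids arrays from srv0start.c):

	/* Sketch of the new calling convention, not part of the patch:
	one i/o handler thread is started per returned segment. */
	ulint	n_threads;
	ulint	i;

	n_threads = os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD,
				srv_n_read_io_threads,
				srv_n_write_io_threads,
				SRV_MAX_N_PENDING_SYNC_IOS);

	for (i = 0; i < n_threads; i++) {
		n[i] = i;
		os_thread_create(io_handler_thread, n + i, thread_ids + i);
	}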
diff -r 322370200e6a innobase/log/log0log.c
--- a/innobase/log/log0log.c	Mon Nov 03 05:07:57 2008 -0800
+++ b/innobase/log/log0log.c	Mon Nov 03 05:08:52 2008 -0800
@@ -1537,6 +1537,30 @@
 	log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE,
 				LOG_WRITE_FROM_BACKGROUND_SYNC);
+}
+
+/********************************************************************
+Flushes the log buffer. Forces it to disk depending on the value of
+innodb_flush_log_at_trx_commit. */
+
+void
+log_buffer_flush_maybe_sync(void)
+/*=============================*/
+{
+	dulint	lsn;
+
+	mutex_enter(&(log_sys->mutex));
+
+	lsn = log_sys->lsn;
+
+	mutex_exit(&(log_sys->mutex));
+
+	/* Force the log buffer to disk when innodb_flush_log_at_trx_commit = 1. */
+	log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS,
+			srv_flush_log_at_trx_commit == 1 ? TRUE : FALSE,
+			srv_flush_log_at_trx_commit == 1
+			? LOG_WRITE_FROM_BACKGROUND_SYNC
+			: LOG_WRITE_FROM_BACKGROUND_ASYNC);
 }

 /********************************************************************
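The helper above collapses to two cases; this restatement is illustrative, not part of the patch:

	if (srv_flush_log_at_trx_commit == 1) {
		/* Durability required: write the log buffer and fsync. */
		log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE,
				LOG_WRITE_FROM_BACKGROUND_SYNC);
	} else {
		/* Write the log buffer but let the OS schedule the flush. */
		log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, FALSE,
				LOG_WRITE_FROM_BACKGROUND_ASYNC);
	}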
diff -r 322370200e6a innobase/os/os0file.c
--- a/innobase/os/os0file.c	Mon Nov 03 05:07:57 2008 -0800
+++ b/innobase/os/os0file.c	Mon Nov 03 05:08:52 2008 -0800
@@ -22,6 +22,8 @@
 #include
 #endif /* UNIV_HOTBACKUP */

+extern long innobase_max_merged_io;
+
 #undef HAVE_FDATASYNC

 #ifdef POSIX_ASYNC_IO
@@ -63,6 +65,28 @@
 ibool	os_aio_use_native_aio	= FALSE;

 ibool	os_aio_print_debug	= FALSE;
+
+/* State of an IO request in simulated AIO.
+   Protocol for simulated aio:
+   client requests IO: find slot with reserved = FALSE. Add entry with
+		status = OS_AIO_NOT_ISSUED.
+   IO thread wakes: find adjacent slots with reserved = TRUE and status =
+		OS_AIO_NOT_ISSUED. Change status for slots to
+		OS_AIO_ISSUED.
+   IO operation completes: set status for slots to OS_AIO_DONE. Set status
+		for the first slot to OS_AIO_CLAIMED and return
+		result for that slot.
+   When there are multiple read and write threads, they all compete to execute
+   the requests in the array (os_aio_array_t). This avoids the need to load
+   balance requests at the time the request is made at the cost of waking all
+   threads when a request is available.
+*/
+typedef enum {
+	OS_AIO_NOT_ISSUED,	/* Available to be processed by an IO thread. */
+	OS_AIO_ISSUED,		/* Being processed by an IO thread. */
+	OS_AIO_DONE,		/* Request processed. */
+	OS_AIO_CLAIMED		/* Result being returned to client. */
+} os_aio_status;

 /* The aio array slot structure */
 typedef struct os_aio_slot_struct	os_aio_slot_t;

 struct os_aio_slot_struct{
@@ -72,6 +96,8 @@
 	ulint		pos;		/* index of the slot in the aio
 					array */
 	ibool		reserved;	/* TRUE if this slot is reserved */
+	os_aio_status	status;		/* Status of the current request. Valid
+					when reserved is TRUE. Used only in
+					simulated aio. */
 	time_t		reservation_time;/* time when reserved */
 	ulint		len;		/* length of the block to read or
 					write */
@@ -82,11 +108,6 @@
 	ulint		offset_high;	/* 32 high bits of file offset */
 	os_file_t	file;		/* file where to read or write */
 	const char*	name;		/* file name or path */
-	ibool		io_already_done;/* used only in simulated aio:
-					TRUE if the physical i/o already
-					made and only the slot message
-					needs to be passed to the caller
-					of os_aio_simulated_handle */
 	fil_node_t*	message1;	/* message which is given by the */
 	void*		message2;	/* the requester of an aio operation
 					and which can be used to identify
@@ -116,9 +137,6 @@
 					in this array */
 	ulint		n_slots;	/* Total number of slots in the aio
 					array. This must be divisible by
 					n_threads. */
-	ulint		n_segments;	/* Number of segments in the aio array of
-					pending aio requests. A thread can wait
-					separately for any one of the segments. */
 	ulint		n_reserved;	/* Number of reserved slots in the
 					aio array outside the ibuf segment */
 	os_aio_slot_t*	slots;		/* Pointer to the slots in the array */
@@ -134,6 +152,17 @@

 /* Array of events used in simulated aio */
 os_event_t*	os_aio_segment_wait_events	= NULL;
+
+/* Number of threads for reading and writing. */
+ulint	os_aio_read_threads	= 0;
+ulint	os_aio_write_threads	= 0;
+
+/* Number of the first global segment for reading. */
+const ulint	os_aio_first_read_segment = 2;
+
+/* Number of the first global segment for writing. Set to
+2 + os_aio_read_threads. */
+ulint	os_aio_first_write_segment = 0;

 /* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
 are NULL when the module has not yet been initialized. */
@@ -143,11 +172,39 @@
 static os_aio_array_t*	os_aio_log_array	= NULL;
 static os_aio_array_t*	os_aio_sync_array	= NULL;

+/* Per thread buffer used for merged IO requests. Used by
+os_aio_simulated_handle so that a buffer doesn't have to be allocated
+for each request. */
+static char*	os_aio_thread_buffer[SRV_MAX_N_IO_THREADS];
+static ulint	os_aio_thread_buffer_size[SRV_MAX_N_IO_THREADS];
+
+/* Count pages read and written per thread */
+static ulint	os_aio_thread_io_reads[SRV_MAX_N_IO_THREADS];
+static ulint	os_aio_thread_io_writes[SRV_MAX_N_IO_THREADS];
+
+/* Number of IO operations done. One request can be for N pages. */
+static ulint	os_aio_thread_io_requests[SRV_MAX_N_IO_THREADS];
+
+/* usecs spent blocked on an IO request */
+static double	os_aio_thread_io_wait[SRV_MAX_N_IO_THREADS];
+/* max usecs spent blocked on an IO request */
+static double	os_aio_thread_max_io_wait[SRV_MAX_N_IO_THREADS];
+
+/* Number of IO global segments. An IO handler thread is created for each
+global segment, except for the segment associated with os_aio_sync_array.
+Several segments can be associated with os_aio_{read,write}_array. One
+segment is created for each of the other arrays. This is also the number
+of valid entries in srv_io_thread_reads, srv_io_thread_writes,
+srv_io_thread_op_info, srv_io_thread_function and os_aio_segment_wait_events. */
 static ulint	os_aio_n_segments	= ULINT_UNDEFINED;

-/* If the following is TRUE, read i/o handler threads try to
-wait until a batch of new read requests have been posted */
-static ibool	os_aio_recommend_sleep_for_read_threads	= FALSE;
+/* Set to TRUE to temporarily block reads from being scheduled while a batch
+of read requests is added, to allow them to be merged by the IO handler thread
+if they are adjacent. Declared volatile because we don't want this to be
+read from a register in a loop when another thread may change the value in
+memory. */
+static volatile ibool	os_aio_recommend_sleep_for_read_threads	= FALSE;

 ulint	os_n_file_reads		= 0;
 ulint	os_bytes_read_since_printout = 0;
@@ -166,6 +223,19 @@
 ulint	os_file_n_pending_pwrites = 0;
 ulint	os_n_pending_writes = 0;
 ulint	os_n_pending_reads = 0;
+
+/* TODO -- does InnoDB provide a portable method for this? */
+static double time_usecs(void) {
+#ifdef __WIN__
+	return 0.0;
+#else
+	struct timeval tv;
+	if (gettimeofday(&tv, NULL))
+		return 0;
+	else
+		return tv.tv_sec * 1000000.0 + tv.tv_usec;
+#endif
+}

 /***************************************************************************
 Gets the operating system version. Currently works only on Windows. */
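On Windows the function above returns 0.0, so the per-thread wait statistics are only meaningful on POSIX builds. A hypothetical Windows branch (an assumption, not part of the patch) could answer the TODO with the performance counter API:

#ifdef __WIN__
/* Hypothetical sketch only: convert the Windows performance counter
to microseconds; assumes a stable counter frequency. */
static double time_usecs(void) {
	LARGE_INTEGER freq, now;
	if (!QueryPerformanceFrequency(&freq)
	    || !QueryPerformanceCounter(&now))
		return 0.0;
	return (double) now.QuadPart * 1000000.0 / (double) freq.QuadPart;
}
#endif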
@@ -1351,6 +1421,8 @@
 	/* We disable OS caching (O_DIRECT) only on data files */
 	if (type != OS_LOG_FILE
 	    && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
+
+		fprintf(stderr, "Using O_DIRECT for file %s\n", name);
 		os_file_set_nocache(file, name, mode_str);
 	}

@@ -1798,6 +1870,32 @@
 #endif /* __WIN__ */
 }

+#ifndef __WIN__
+/***************************************************************************
+Flushes a given file to disk with fsync, unless the flush method is
+SRV_UNIX_NOSYNC, in which case this is a no-op. */
+
+ibool
+os_maybe_fsync(
+/*===========*/
+				/* out: 0 if success, error code otherwise */
+	os_file_t	file)	/* in: handle to a file */
+{
+	return (srv_unix_file_flush_method == SRV_UNIX_NOSYNC)
+		? 0 : fsync(file);
+}
+
+/***************************************************************************
+Flushes a given file to disk with fdatasync, unless the flush method is
+SRV_UNIX_NOSYNC, in which case this is a no-op. */
+
+ibool
+os_maybe_fdatasync(
+/*===============*/
+				/* out: 0 if success, error code otherwise */
+	os_file_t	file)	/* in: handle to a file */
+{
+	return (srv_unix_file_flush_method == SRV_UNIX_NOSYNC)
+		? 0 : fdatasync(file);
+}
+#endif
+
 /***************************************************************************
 Flushes the write buffers of a given file to the disk. */

@@ -1855,21 +1953,21 @@
 		/* If we are not on an operating system that supports this,
 		then fall back to a plain fsync. */

-		ret = fsync(file);
+		ret = os_maybe_fsync(file);
 	} else {
 		ret = fcntl(file, F_FULLFSYNC, NULL);

 		if (ret) {
 			/* If we are not on a file system that supports this,
 			then fall back to a plain fsync. */
-			ret = fsync(file);
+			ret = os_maybe_fsync(file);
 		}
 	}
 #elif HAVE_FDATASYNC
-	ret = fdatasync(file);
+	ret = os_maybe_fdatasync(file);
 #else
 /*	fprintf(stderr, "Flushing to file %p\n", file); */
-	ret = fsync(file);
+	ret = os_maybe_fsync(file);
 #endif
 	os_n_fsyncs++;
@@ -2298,6 +2396,9 @@

 		return(TRUE);
 	}

+	fprintf(stderr,
+"InnoDB: error: os_file_pread wanted %lu and got %lu.\n",
+		(ulint) n, (ulint) ret);
 #endif
 #ifdef __WIN__
 error_handling:
@@ -2784,9 +2885,8 @@
 os_aio_array_create(
 /*================*/
 				/* out, own: aio array */
-	ulint	n,		/* in: maximum number of pending aio operations
-				allowed; n must be divisible by n_segments */
-	ulint	n_segments)	/* in: number of segments in the aio array */
+	ulint	n)		/* in: maximum number of pending aio operations
+				allowed */
 {
 	os_aio_array_t*	array;
 	ulint		i;
@@ -2795,7 +2895,6 @@
 	OVERLAPPED*	over;
 #endif
 	ut_a(n > 0);
-	ut_a(n_segments > 0);

 	array = ut_malloc(sizeof(os_aio_array_t));
@@ -2806,7 +2905,6 @@
 	os_event_set(array->is_empty);

 	array->n_slots		= n;
-	array->n_segments	= n_segments;
 	array->n_reserved	= 0;
 	array->slots		= ut_malloc(n * sizeof(os_aio_slot_t));
 #ifdef __WIN__
@@ -2833,70 +2931,75 @@
 /****************************************************************************
 Initializes the asynchronous io system. Calls also os_io_init_simple.
-Creates a separate aio array for
-non-ibuf read and write, a third aio array for the ibuf i/o, with just one
-segment, two aio arrays for log reads and writes with one segment, and a
-synchronous aio array of the specified size. The combined number of segments
-in the three first aio arrays is the parameter n_segments given to the
-function. The caller must create an i/o handler thread for each segment in
-the four first arrays, but not for the sync aio array. */
+Creates an aio array for each of non-ibuf read, non-ibuf write, ibuf IO,
+log IO, and synchronous IO. The caller must create an i/o handler thread for
+each array except the synchronous aio array. Multiple threads can access the
+same array for the non-ibuf read (prefetch) and write (flush dirty buffer
+pages) arrays.
+Returns the number of AIO handler threads. */

-void
+ulint
 os_aio_init(
 /*========*/
-	ulint	n,		/* in: maximum number of pending aio operations
-				allowed; n must be divisible by n_segments */
-	ulint	n_segments,	/* in: combined number of segments in the four
-				first aio arrays; must be >= 4 */
+	ulint	ios_per_array,	/* in: maximum number of pending aio operations
+				allowed per array */
+	ulint	n_read_threads,	/* in: number of read threads */
+	ulint	n_write_threads,/* in: number of write threads */
 	ulint	n_slots_sync)	/* in: number of slots in the sync aio array */
 {
-	ulint	n_read_segs;
-	ulint	n_write_segs;
-	ulint	n_per_seg;
-	ulint	i;
+	ulint	i;
+	ulint	n_segments = 2 + n_read_threads + n_write_threads;
 #ifdef POSIX_ASYNC_IO
 	sigset_t   sigset;
 #endif
-	ut_ad(n % n_segments == 0);
-	ut_ad(n_segments >= 4);
+	ut_a(ios_per_array >= OS_AIO_N_PENDING_IOS_PER_THREAD);
+	ut_a(n_read_threads >= 1 && n_read_threads <= 64);
+	ut_a(n_write_threads >= 1 && n_write_threads <= 64);
+	ut_a(n_segments < SRV_MAX_N_IO_THREADS);

 	os_io_init_simple();

 	for (i = 0; i < n_segments; i++) {
 		srv_set_io_thread_op_info(i, "not started yet");
-	}
-
-	n_per_seg = n / n_segments;
-	n_write_segs = (n_segments - 2) / 2;
-	n_read_segs = n_segments - 2 - n_write_segs;
-
-	/* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
-
-	os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
+		os_aio_thread_io_reads[i] = 0;
+		os_aio_thread_io_writes[i] = 0;
+		os_aio_thread_io_requests[i] = 0;
+		os_aio_thread_buffer[i] = 0;
+		os_aio_thread_buffer_size[i] = 0;
+		os_aio_thread_io_wait[i] = 0;
+		os_aio_thread_max_io_wait[i] = 0;
+	}
+
+	os_aio_read_threads = n_read_threads;
+	os_aio_write_threads = n_write_threads;
+	os_aio_first_write_segment = os_aio_first_read_segment
+					+ os_aio_read_threads;
+
+	fprintf(stderr,
+	"InnoDB: ios_per_array %lu read threads %lu write threads %lu\n",
+		ios_per_array, os_aio_read_threads, os_aio_write_threads);
+
+	os_aio_ibuf_array = os_aio_array_create(ios_per_array);

 	srv_io_thread_function[0] = "insert buffer thread";

-	os_aio_log_array = os_aio_array_create(n_per_seg, 1);
+	os_aio_log_array = os_aio_array_create(ios_per_array);

 	srv_io_thread_function[1] = "log thread";

-	os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
-						n_read_segs);
-	for (i = 2; i < 2 + n_read_segs; i++) {
+	os_aio_read_array = os_aio_array_create(ios_per_array);
+	for (i = os_aio_first_read_segment;
+	     i < os_aio_first_write_segment; i++) {
 		ut_a(i < SRV_MAX_N_IO_THREADS);
 		srv_io_thread_function[i] = "read thread";
 	}

-	os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
-						n_write_segs);
-	for (i = 2 + n_read_segs; i < n_segments; i++) {
+	os_aio_write_array = os_aio_array_create(ios_per_array);
+	for (i = os_aio_first_write_segment; i < n_segments; i++) {
 		ut_a(i < SRV_MAX_N_IO_THREADS);
 		srv_io_thread_function[i] = "write thread";
 	}

-	os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
-
-	os_aio_n_segments = n_segments;
+	os_aio_sync_array = os_aio_array_create(n_slots_sync);
+
+	os_aio_n_segments = 2 + os_aio_read_threads + os_aio_write_threads;

 	os_aio_validate();
@@ -2924,6 +3027,7 @@
 	pthread_sigmask(SIG_BLOCK, &sigset, NULL); */
 #endif
+	return os_aio_n_segments;
 }

 #ifdef WIN_ASYNC_IO
@@ -2981,77 +3085,32 @@
 	os_event_wait(os_aio_write_array->is_empty);
 }

-/**************************************************************************
-Calculates segment number for a slot. */
-static
-ulint
-os_aio_get_segment_no_from_slot(
-/*============================*/
-				/* out: segment number (which is the number
-				used by, for example, i/o-handler threads) */
-	os_aio_array_t*	array,	/* in: aio wait array */
-	os_aio_slot_t*	slot)	/* in: slot in this array */
-{
-	ulint	segment;
-	ulint	seg_len;
-
-	if (array == os_aio_ibuf_array) {
-		segment = 0;
-
-	} else if (array == os_aio_log_array) {
-		segment = 1;
-
-	} else if (array == os_aio_read_array) {
-		seg_len = os_aio_read_array->n_slots /
-				os_aio_read_array->n_segments;
-
-		segment = 2 + slot->pos / seg_len;
-	} else {
-		ut_a(array == os_aio_write_array);
-		seg_len = os_aio_write_array->n_slots /
-				os_aio_write_array->n_segments;
-
-		segment = os_aio_read_array->n_segments + 2
-			+ slot->pos / seg_len;
-	}
-
-	return(segment);
-}
-
-/**************************************************************************
-Calculates local segment number and aio array from global segment number. */
-static
-ulint
-os_aio_get_array_and_local_segment(
-/*===============================*/
+/**************************************************************************
+Calculates the aio array from the global segment number. */
+static
+os_aio_array_t*
+os_aio_get_array(
+/*=============*/
-				/* out: local segment number within
-				the aio array */
-	os_aio_array_t** array,	/* out: aio wait array */
+				/* out: aio wait array */
 	ulint	 global_segment)/* in: global segment number */
 {
-	ulint	segment;
 	ut_a(global_segment < os_aio_n_segments);

 	if (global_segment == 0) {
-		*array = os_aio_ibuf_array;
-		segment = 0;
+		return os_aio_ibuf_array;

 	} else if (global_segment == 1) {
-		*array = os_aio_log_array;
-		segment = 0;
-
-	} else if (global_segment < os_aio_read_array->n_segments + 2) {
-		*array = os_aio_read_array;
-
-		segment = global_segment - 2;
-	} else {
-		*array = os_aio_write_array;
-
-		segment = global_segment - (os_aio_read_array->n_segments + 2);
-	}
-
-	return(segment);
+		return os_aio_log_array;
+
+	} else if (global_segment < os_aio_first_write_segment) {
+		return os_aio_read_array;
+
+	} else {
+		return os_aio_write_array;
+
+	}
 }

 /***********************************************************************
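For reference, the global segment numbering implied by os_aio_get_array is: segment 0 is the ibuf array, segment 1 the log array, segments [2, os_aio_first_write_segment) share the read array, and the remaining segments share the write array. A small illustrative helper (the function name is hypothetical, not part of the patch):

/* Illustrative only: name the kind of a global segment under the new layout. */
static const char*
os_aio_segment_kind(ulint global_segment)
{
	ut_a(global_segment < os_aio_n_segments);

	if (global_segment == 0) return("ibuf");
	if (global_segment == 1) return("log");
	if (global_segment < os_aio_first_write_segment) return("read");
	return("write");
}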
@@ -3160,7 +3219,7 @@
 		os_aio_simulated_wake_handler_threads();
 	}
-
+
 	os_event_wait(array->not_full);

 	goto loop;
@@ -3173,7 +3232,7 @@
 			break;
 		}
 	}
-
+
 	ut_a(i < array->n_slots);

 	array->n_reserved++;

 	if (array->n_reserved == 1) {
@@ -3195,7 +3254,7 @@
 	slot->buf      = buf;
 	slot->offset   = offset;
 	slot->offset_high = offset_high;
-	slot->io_already_done = FALSE;
+	slot->status = OS_AIO_NOT_ISSUED;

 #ifdef WIN_ASYNC_IO
 	control = &(slot->control);
@@ -3246,8 +3305,9 @@
 	os_mutex_enter(array->mutex);

 	ut_ad(slot->reserved);
-
+
 	slot->reserved = FALSE;
+	slot->status = OS_AIO_NOT_ISSUED;

 	array->n_reserved--;
@@ -3266,36 +3326,40 @@
 }

 /**************************************************************************
-Wakes up a simulated aio i/o-handler thread if it has something to do. */
+Wakes up the simulated aio i/o-handler threads for a given array if there
+is work to do. */
 static
 void
 os_aio_simulated_wake_handler_thread(
 /*=================================*/
-	ulint	global_segment)	/* in: the number of the segment in the aio
-				arrays */
-{
-	os_aio_array_t*	array;
-	os_aio_slot_t*	slot;
-	ulint		segment;
+	os_aio_array_t*	array)	/* in: aio array for which wakeup is done */
+{
+	os_aio_slot_t*	slot;
 	ulint		n;
 	ulint		i;

 	ut_ad(!os_aio_use_native_aio);

-	segment = os_aio_get_array_and_local_segment(&array, global_segment);
-
-	n = array->n_slots / array->n_segments;
-
-	/* Look through n slots after the segment * n'th slot */
-
-	os_mutex_enter(array->mutex);
-
-	for (i = 0; i < n; i++) {
-		slot = os_aio_array_get_nth_slot(array, i + segment * n);
-
-		if (slot->reserved) {
-			/* Found an i/o request */
-
+	n = array->n_slots;
+
+	/* Look through n slots */
+
+	os_mutex_enter(array->mutex);
+
+	for (i = 0; i < n; i++) {
+		slot = os_aio_array_get_nth_slot(array, i);
+
+		if (slot->reserved &&
+		    (slot->status == OS_AIO_NOT_ISSUED ||
+		     slot->status == OS_AIO_DONE)) {
+			/* Found an i/o request. OS_AIO_NOT_ISSUED means the
+			read or write request has yet to be done. OS_AIO_DONE
+			means the request has been done but it was part of a
+			set of requests merged into one read or write call
+			and was not the first block in the request, so the
+			handling of the IO completion for that block has not
+			been done. */
+
 			break;
 		}
 	}
@@ -3303,7 +3367,25 @@
 	os_mutex_exit(array->mutex);

 	if (i < n) {
-		os_event_set(os_aio_segment_wait_events[global_segment]);
+		if (array == os_aio_ibuf_array) {
+			os_event_set(os_aio_segment_wait_events[0]);
+
+		} else if (array == os_aio_log_array) {
+			os_event_set(os_aio_segment_wait_events[1]);
+
+		} else if (array == os_aio_read_array) {
+			ulint x;
+			for (x = os_aio_first_read_segment;
+			     x < os_aio_first_write_segment; x++)
+				os_event_set(os_aio_segment_wait_events[x]);
+
+		} else if (array == os_aio_write_array) {
+			ulint x;
+			for (x = os_aio_first_write_segment;
+			     x < os_aio_n_segments; x++)
+				os_event_set(os_aio_segment_wait_events[x]);
+
+		} else {
+			ut_a(0);
+		}
 	}
 }

@@ -3320,13 +3402,14 @@

 		/* We do not use simulated aio: do nothing */

 		return;
-	}
-
-	os_aio_recommend_sleep_for_read_threads	= FALSE;
-
-	for (i = 0; i < os_aio_n_segments; i++) {
-		os_aio_simulated_wake_handler_thread(i);
-	}
+	}
+
+	os_aio_recommend_sleep_for_read_threads	= FALSE;
+
+	os_aio_simulated_wake_handler_thread(os_aio_ibuf_array);
+	os_aio_simulated_wake_handler_thread(os_aio_log_array);
+	os_aio_simulated_wake_handler_thread(os_aio_read_array);
+	os_aio_simulated_wake_handler_thread(os_aio_write_array);
 }

 /**************************************************************************
@@ -3339,18 +3422,13 @@
 os_aio_simulated_put_read_threads_to_sleep(void)
 /*============================================*/
 {
-	os_aio_array_t*	array;
 	ulint		g;

+	/* TODO(mcallaghan): provide a similar function for writes? */
 	os_aio_recommend_sleep_for_read_threads	= TRUE;

-	for (g = 0; g < os_aio_n_segments; g++) {
-		os_aio_get_array_and_local_segment(&array, g);
-
-		if (array == os_aio_read_array) {
-
-			os_event_reset(os_aio_segment_wait_events[g]);
-		}
+	for (g = os_aio_first_read_segment;
+	     g < os_aio_first_write_segment; g++) {
+		os_event_reset(os_aio_segment_wait_events[g]);
 	}
 }

@@ -3480,8 +3558,7 @@
 #endif
 		} else {
 			if (!wake_later) {
-				os_aio_simulated_wake_handler_thread(
-				os_aio_get_segment_no_from_slot(array, slot));
+				os_aio_simulated_wake_handler_thread(array);
 			}
 		}
 	} else if (type == OS_FILE_WRITE) {
@@ -3497,8 +3574,7 @@
 #endif
 		} else {
 			if (!wake_later) {
-				os_aio_simulated_wake_handler_thread(
-				os_aio_get_segment_no_from_slot(array, slot));
+				os_aio_simulated_wake_handler_thread(array);
 			}
 		}
 	} else {
@@ -3561,7 +3637,7 @@
 os_aio_windows_handle(
 /*==================*/
 				/* out: TRUE if the aio operation succeeded */
-	ulint	segment,	/* in: the number of the segment in the aio
+	ulint	global_segment,	/* in: the number of the segment in the aio
 				arrays to wait for; segment 0 is the ibuf
 				i/o thread, segment 1 the log i/o thread,
 				then follow the non-ibuf read threads, and as
@@ -3579,7 +3655,6 @@
 	void**	message2,
 	ulint*	type)		/* out: OS_FILE_WRITE or ..._READ */
 {
-	ulint		orig_seg	= segment;
 	os_aio_array_t*	array;
 	os_aio_slot_t*	slot;
 	ulint		n;
@@ -3588,33 +3663,30 @@
 	BOOL		ret;
 	DWORD		len;

-	if (segment == ULINT_UNDEFINED) {
+	if (global_segment == ULINT_UNDEFINED) {
 		array = os_aio_sync_array;
-		segment = 0;
-	} else {
-		segment = os_aio_get_array_and_local_segment(&array, segment);
+	} else {
+		array = os_aio_get_array(global_segment);
 	}

 	/* NOTE! We only access constant fields in os_aio_array. Therefore
 	we do not have to acquire the protecting mutex yet */

 	ut_ad(os_aio_validate());
-	ut_ad(segment < array->n_segments);
-
-	n = array->n_slots / array->n_segments;
+
+	n = array->n_slots;

 	if (array == os_aio_sync_array) {
 		os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
 		i = pos;
 	} else {
-		srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
-		i = os_event_wait_multiple(n,
-				(array->native_events) + segment * n);
-	}
-
-	os_mutex_enter(array->mutex);
-
-	slot = os_aio_array_get_nth_slot(array, i + segment * n);
+		srv_set_io_thread_op_info(global_segment, "wait Windows aio");
+		i = os_event_wait_multiple(n, (array->native_events));
+	}
+
+	os_mutex_enter(array->mutex);
+
+	slot = os_aio_array_get_nth_slot(array, i);

 	ut_a(slot->reserved);
@@ -3787,14 +3859,16 @@
 	ulint*	type)		/* out: OS_FILE_WRITE or ..._READ */
 {
 	os_aio_array_t*	array;
-	ulint		segment;
 	os_aio_slot_t*	slot;
 	os_aio_slot_t*	slot2;
 	os_aio_slot_t*	consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
+	os_aio_slot_t*	lowest_request;
+	os_aio_slot_t*	oldest_request;
 	ulint		n_consecutive;
 	ulint		total_len;
 	ulint		offs;
 	ulint		lowest_offset;
+	ulint		oldest_offset;
 	ulint		biggest_age;
 	ulint		age;
 	byte*		combined_buf;
@@ -3802,8 +3876,10 @@
 	ibool		ret;
 	ulint		n;
 	ulint		i;
-
-	segment = os_aio_get_array_and_local_segment(&array, global_segment);
+	double		start_usecs, stop_usecs, elapsed_usecs;
+	time_t		now;
+
+	array = os_aio_get_array(global_segment);

restart:
 	/* NOTE! We only access constant fields in os_aio_array. Therefore
@@ -3812,11 +3888,10 @@
 	we do not have to acquire the protecting mutex yet */

 	srv_set_io_thread_op_info(global_segment,
				"looking for i/o requests (a)");
 	ut_ad(os_aio_validate());
-	ut_ad(segment < array->n_segments);
-
-	n = array->n_slots / array->n_segments;
-
-	/* Look through n slots after the segment * n'th slot */
+
+	n = array->n_slots;
+
+	/* Look through n slots */

 	if (array == os_aio_read_array
 	    && os_aio_recommend_sleep_for_read_threads) {
@@ -3836,9 +3911,9 @@
 	done */

 	for (i = 0; i < n; i++) {
-		slot = os_aio_array_get_nth_slot(array, i + segment * n);
-
-		if (slot->reserved && slot->io_already_done) {
+		slot = os_aio_array_get_nth_slot(array, i);
+
+		if (slot->reserved && slot->status == OS_AIO_DONE) {

 			if (os_aio_print_debug) {
 				fprintf(stderr,
@@ -3846,79 +3921,66 @@
 			}

 			ret = TRUE;
-
+
 			goto slot_io_done;
 		}
 	}

-	n_consecutive = 0;
-
-	/* If there are at least 2 seconds old requests, then pick the oldest
-	one to prevent starvation. If several requests have the same age,
-	then pick the one at the lowest offset. */
-
 	biggest_age = 0;
-	lowest_offset = ULINT_MAX;
-
-	for (i = 0; i < n; i++) {
-		slot = os_aio_array_get_nth_slot(array, i + segment * n);
-
-		if (slot->reserved) {
-			age = (ulint)difftime(time(NULL),
-						slot->reservation_time);
-
+	now = time(NULL);
+	oldest_request = lowest_request = NULL;
+	oldest_offset = lowest_offset = ULINT_MAX;
+
+	/* Find the oldest request and the request with the smallest offset */
+	for (i = 0; i < n; i++) {
+		slot = os_aio_array_get_nth_slot(array, i);
+
+		if (slot->reserved && slot->status == OS_AIO_NOT_ISSUED) {
+			age = (ulint)difftime(now, slot->reservation_time);
+
+			/* If there are requests at least 2 seconds old, then
+			pick the oldest one to prevent starvation. If several
+			requests have the same age, then pick the one at the
+			lowest offset. */
 			if ((age >= 2 && age > biggest_age)
 			    || (age >= 2 && age == biggest_age
-				&& slot->offset < lowest_offset)) {
+				&& slot->offset < oldest_offset)) {

 				/* Found an i/o request */
-				consecutive_ios[0] = slot;
-
-				n_consecutive = 1;
-
 				biggest_age = age;
+				oldest_request = slot;
+				oldest_offset = slot->offset;
+			}
+
+			/* Look for an i/o request at the lowest offset in the
+			array (we ignore the high 32 bits of the offset) */
+			if (slot->offset < lowest_offset) {
+				/* Found an i/o request */
+				lowest_request = slot;
 				lowest_offset = slot->offset;
 			}
 		}
 	}

-	if (n_consecutive == 0) {
-		/* There were no old requests. Look for an i/o request at the
-		lowest offset in the array (we ignore the high 32 bits of the
-		offset in these heuristics) */
-
-		lowest_offset = ULINT_MAX;
-
-		for (i = 0; i < n; i++) {
-			slot = os_aio_array_get_nth_slot(array,
-							i + segment * n);
-
-			if (slot->reserved && slot->offset < lowest_offset) {
-
-				/* Found an i/o request */
-				consecutive_ios[0] = slot;
-
-				n_consecutive = 1;
-
-				lowest_offset = slot->offset;
-			}
-		}
-	}
-
-	if (n_consecutive == 0) {
+	if (!lowest_request && !oldest_request) {

 		/* No i/o requested at the moment */

 		goto wait_for_io;
 	}

-	slot = consecutive_ios[0];
+	if (oldest_request) {
+		slot = oldest_request;
+	} else {
+		slot = lowest_request;
+	}
+	consecutive_ios[0] = slot;
+	n_consecutive = 1;

 	/* Check if there are several consecutive blocks to read or write */

consecutive_loop:
 	for (i = 0; i < n; i++) {
-		slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
+		slot2 = os_aio_array_get_nth_slot(array, i);

 		if (slot2->reserved && slot2 != slot
 		    && slot2->offset == slot->offset + slot->len
 		    && slot->offset + slot->len > slot->offset /* check that
				sum does not wrap over */
 		    && slot2->offset_high == slot->offset_high
 		    && slot2->type == slot->type
-		    && slot2->file == slot->file) {
+		    && slot2->file == slot->file
+		    && slot2->status == OS_AIO_NOT_ISSUED) {

 			/* Found a consecutive i/o request */

 			consecutive_ios[n_consecutive] = slot2;
 			n_consecutive++;

 			slot = slot2;

-			if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
+			if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE
+			    && n_consecutive < innobase_max_merged_io) {

 				goto consecutive_loop;
 			} else {

 				break;
 			}
 		}
 	}
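The merge test in the loop above can be read as a single predicate; this distilled restatement is illustrative only (the helper name is hypothetical, not part of the patch):

/* Illustrative restatement of the merge condition used above: slot2 extends
slot when it starts exactly where slot ends, in the same file and the same
direction, the 32-bit offset sum did not wrap, and slot2 is still unissued. */
static ibool
os_aio_can_merge(const os_aio_slot_t* slot, const os_aio_slot_t* slot2)
{
	return(slot2->reserved && slot2 != slot
	       && slot2->offset == slot->offset + slot->len
	       && slot->offset + slot->len > slot->offset
	       && slot2->offset_high == slot->offset_high
	       && slot2->type == slot->type
	       && slot2->file == slot->file
	       && slot2->status == OS_AIO_NOT_ISSUED);
}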
@@ -3955,6 +4019,8 @@

 	for (i = 0; i < n_consecutive; i++) {
 		total_len += consecutive_ios[i]->len;
+		ut_a(consecutive_ios[i]->status == OS_AIO_NOT_ISSUED);
+		consecutive_ios[i]->status = OS_AIO_ISSUED;
 	}

 	if (n_consecutive == 1) {
@@ -3962,7 +4028,16 @@
 		combined_buf = slot->buf;
 		combined_buf2 = NULL;
 	} else {
-		combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE);
+		if ((total_len + UNIV_PAGE_SIZE)
+		    > os_aio_thread_buffer_size[global_segment]) {
+
+			if (os_aio_thread_buffer[global_segment])
+				ut_free(os_aio_thread_buffer[global_segment]);
+
+			os_aio_thread_buffer[global_segment]
+				= ut_malloc(total_len + UNIV_PAGE_SIZE);
+
+			os_aio_thread_buffer_size[global_segment]
+				= total_len + UNIV_PAGE_SIZE;
+		}
+		combined_buf2 = os_aio_thread_buffer[global_segment];

 		ut_a(combined_buf2);
@@ -3973,6 +4048,9 @@
 	this assumes that there is just one i/o-handler thread serving
 	a single segment of slots! */

+	ut_a(slot->reserved);
+	ut_a(slot->status == OS_AIO_ISSUED);
+
 	os_mutex_exit(array->mutex);

 	if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
@@ -3998,6 +4076,7 @@
 	/* Do the i/o with ordinary, synchronous i/o functions: */

 	if (slot->type == OS_FILE_WRITE) {
+		os_aio_thread_io_writes[global_segment] += n_consecutive;
 		if (array == os_aio_write_array) {
 			if ((total_len % UNIV_PAGE_SIZE != 0)
 			    || (slot->offset % UNIV_PAGE_SIZE != 0)) {
@@ -4012,16 +4091,34 @@
 			os_file_check_page_trailers(combined_buf, total_len);
 		}

+		start_usecs = time_usecs();
 		ret = os_file_write(slot->name, slot->file, combined_buf,
 				slot->offset, slot->offset_high, total_len);
-
+		stop_usecs = time_usecs();
+		elapsed_usecs = stop_usecs - start_usecs;
+		if (elapsed_usecs < 0) elapsed_usecs = 0;
+
 		if (array == os_aio_write_array) {
 			os_file_check_page_trailers(combined_buf, total_len);
 		}
-	} else {
+		os_aio_write_requests++;
+		os_aio_pages_written += n_consecutive;
+		os_aio_write_time += (ib_longlong)elapsed_usecs;
+	} else {
+		start_usecs = time_usecs();
+		os_aio_thread_io_reads[global_segment] += n_consecutive;
 		ret = os_file_read(slot->file, combined_buf,
 				slot->offset, slot->offset_high, total_len);
-	}
+		stop_usecs = time_usecs();
+		elapsed_usecs = stop_usecs - start_usecs;
+		if (elapsed_usecs < 0) elapsed_usecs = 0;
+		os_aio_read_requests++;
+		os_aio_pages_read += n_consecutive;
+		os_aio_read_time += (ib_longlong)elapsed_usecs;
+	}
+
+	if (elapsed_usecs > os_aio_thread_max_io_wait[global_segment])
+		os_aio_thread_max_io_wait[global_segment] = elapsed_usecs;
+	os_aio_thread_io_wait[global_segment] += elapsed_usecs;
+	os_aio_thread_io_requests[global_segment]++;

 	ut_a(ret);
 	srv_set_io_thread_op_info(global_segment, "file i/o done");
@@ -4042,16 +4139,13 @@
 		}
 	}

-	if (combined_buf2) {
-		ut_free(combined_buf2);
-	}
-
 	os_mutex_enter(array->mutex);

 	/* Mark the i/os done in slots */

 	for (i = 0; i < n_consecutive; i++) {
-		consecutive_ios[i]->io_already_done = TRUE;
+		ut_a(consecutive_ios[i]->status == OS_AIO_ISSUED);
+		consecutive_ios[i]->status = OS_AIO_DONE;
 	}

 	/* We return the messages for the first slot now, and if there were
@@ -4061,6 +4155,8 @@
slot_io_done:

 	ut_a(slot->reserved);
+	ut_a(slot->status == OS_AIO_DONE);
+	slot->status = OS_AIO_CLAIMED;

 	*message1 = slot->message1;
 	*message2 = slot->message2;
@@ -4070,7 +4166,8 @@
 	os_mutex_exit(array->mutex);

 	os_aio_array_free_slot(array, slot);
-
+	srv_set_io_thread_op_info(global_segment, "exited handler");
+
 	return(ret);

wait_for_io:
@@ -4115,7 +4212,6 @@
 	os_mutex_enter(array->mutex);

 	ut_a(array->n_slots > 0);
-	ut_a(array->n_segments > 0);

 	for (i = 0; i < array->n_slots; i++) {
 		slot = os_aio_array_get_nth_slot(array, i);
@@ -4165,11 +4261,20 @@
 	double		time_elapsed;
 	double		avg_bytes_read;
 	ulint		i;
-
-	for (i = 0; i < srv_n_file_io_threads; i++) {
-		fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
-			srv_io_thread_op_info[i],
-			srv_io_thread_function[i]);
+	ulint		num_issued, num_done, num_claimed;
+
+	if (file) {
+	for (i = 0; i < os_aio_n_segments; i++) {
+		fprintf(file,
+			"I/O thread %lu state: %s (%s) reads %lu writes %lu "
+			"requests %lu io secs %lf io msecs/request %lf max_io_wait %lf",
+			(ulong) i, srv_io_thread_op_info[i],
+			srv_io_thread_function[i],
+			os_aio_thread_io_reads[i], os_aio_thread_io_writes[i],
+			os_aio_thread_io_requests[i],
+			os_aio_thread_io_wait[i] / 1000000.0,
+			os_aio_thread_io_requests[i] ?
+			os_aio_thread_io_wait[i]
+				/ os_aio_thread_io_requests[i] / 1000.0 : 0.0,
+			os_aio_thread_max_io_wait[i] / 1000.0);

 #ifndef __WIN__
 		if (os_aio_segment_wait_events[i]->is_set) {
@@ -4181,6 +4286,7 @@
 	}

 	fputs("Pending normal aio reads:", file);
+	} /* if (file) */

 	array = os_aio_read_array;
loop:
@@ -4189,14 +4295,23 @@
 	os_mutex_enter(array->mutex);

 	ut_a(array->n_slots > 0);
-	ut_a(array->n_segments > 0);

 	n_reserved = 0;
+	num_done = num_issued = num_claimed = 0;

 	for (i = 0; i < array->n_slots; i++) {
 		slot = os_aio_array_get_nth_slot(array, i);

 		if (slot->reserved) {
+			if (slot->status == OS_AIO_ISSUED)
+				num_issued++;
+			else if (slot->status == OS_AIO_DONE)
+				num_done++;
+			else {
+				ut_ad(slot->status == OS_AIO_CLAIMED);
+				num_claimed++;
+			}
+
 			n_reserved++;
/*			fprintf(stderr, "Reserved slot, messages %p %p\n",
				slot->message1, slot->message2); */
@@ -4206,42 +4321,56 @@

 	ut_a(array->n_reserved == n_reserved);

-	fprintf(file, " %lu", (ulong) n_reserved);
-
+	if (file) fprintf(file, " %lu", (ulong) n_reserved);
+
 	os_mutex_exit(array->mutex);

 	if (array == os_aio_read_array) {
-		fputs(", aio writes:", file);
-
+		inno_pending_normal_aio_reads = (ulong) n_reserved;
+		if (file) fputs(", aio writes:", file);
 		array = os_aio_write_array;

 		goto loop;
 	}

 	if (array == os_aio_write_array) {
-		fputs(",\n ibuf aio reads:", file);
+		inno_pending_normal_aio_writes = (ulong) n_reserved;
+		if (file) fputs(",\n ibuf aio reads:", file);
 		array = os_aio_ibuf_array;

 		goto loop;
 	}

 	if (array == os_aio_ibuf_array) {
-		fputs(", log i/o's:", file);
+		inno_pending_ibuf_aio_reads = (ulong) n_reserved;
+		if (file) fputs(", log i/o's:", file);
 		array = os_aio_log_array;

 		goto loop;
 	}

 	if (array == os_aio_log_array) {
-		fputs(", sync i/o's:", file);
+		inno_pending_log_ios = (ulong) n_reserved;
+		if (file) fputs(", sync i/o's:", file);
 		array = os_aio_sync_array;

 		goto loop;
 	}

-	putc('\n', file);
+	if (array == os_aio_sync_array) {
+		inno_pending_sync_ios = (ulong) n_reserved;
+	}
+
 	current_time = time(NULL);
 	time_elapsed = 0.001 + difftime(current_time, os_last_printout);
+
+	if (file) {
+	putc('\n', file);
+	fprintf(file,
+		"Summary of background IO slot status: %lu issued, "
+		"%lu done, %lu claimed, sleep set %d\n",
+		(ulong) num_issued, (ulong) num_done, (ulong) num_claimed,
+		(int) os_aio_recommend_sleep_for_read_threads);

 	fprintf(file,
 		"Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
@@ -4274,6 +4403,7 @@
			/ time_elapsed,
		(os_n_fsyncs - os_n_fsyncs_old) / time_elapsed);
+	} /* if (file) */

 	os_n_file_reads_old = os_n_file_reads;
 	os_n_file_writes_old = os_n_file_writes;
diff -r 322370200e6a innobase/srv/srv0srv.c
--- a/innobase/srv/srv0srv.c	Mon Nov 03 05:07:57 2008 -0800
+++ b/innobase/srv/srv0srv.c	Mon Nov 03 05:08:52 2008 -0800
@@ -164,7 +164,17 @@
 ulint	srv_mem_pool_size	= ULINT_MAX;	/* size in bytes */
 ulint	srv_lock_table_size	= ULINT_MAX;

+ulint	srv_io_capacity		= ULINT_MAX;	/* Number of IO operations per
						second the server can do */
+
+ibool	srv_extra_dirty_writes	= TRUE;	/* Write dirty pages to disk when pct
					dirty < max dirty pct */
+
+/* Deprecated by srv_n_{read,write}_io_threads */
 ulint	srv_n_file_io_threads	= ULINT_MAX;
+/* Number of background IO threads for read and write requests */
+ulint	srv_n_read_io_threads	= ULINT_MAX;
+ulint	srv_n_write_io_threads	= ULINT_MAX;

 #ifdef UNIV_LOG_ARCHIVE
 ibool	srv_log_archive_on	= FALSE;
@@ -238,6 +248,24 @@
 /* variable to count the number of random read-aheads */
 ulint	srv_read_ahead_rnd	= 0;

+/* Number of IO operations read/write done for all threads */
+ulint	os_aio_read_requests	= 0;
+ulint	os_aio_write_requests	= 0;
+
+/* Number of pages read/written done for all threads */
+ulint	os_aio_pages_read	= 0;
+ulint	os_aio_pages_written	= 0;
+
+/* time usec used to perform read/write for all threads */
+ib_longlong	os_aio_read_time	= 0;
+ib_longlong	os_aio_write_time	= 0;
+
+ulint	inno_pending_normal_aio_reads	= 0;
+ulint	inno_pending_normal_aio_writes	= 0;
+ulint	inno_pending_ibuf_aio_reads	= 0;
+ulint	inno_pending_log_ios		= 0;
+ulint	inno_pending_sync_ios		= 0;

 /* structure to pass status variables to MySQL */
 export_struc export_vars;
@@ -413,6 +441,23 @@

 ulint	srv_main_thread_process_no	= 0;
 ulint	srv_main_thread_id		= 0;
+
+/* The following count work done by srv_master_thread. */
+
+/* Iterations by the 'once per second' loop. */
+ulint	srv_main_1_second_loops		= 0;
+/* Calls to sleep by the 'once per second' loop. */
+ulint	srv_main_sleeps			= 0;
+/* Iterations by the 'once per 10 seconds' loop. */
+ulint	srv_main_10_second_loops	= 0;
+/* Iterations of the loop bounded by the 'background_loop' label. */
+ulint	srv_main_background_loops	= 0;
+/* Iterations of the loop bounded by the 'flush_loop' label. */
+ulint	srv_main_flush_loops		= 0;
+/* Calls to log_buffer_flush_to_disk. */
+ulint	srv_sync_flush			= 0;
+/* Calls to log_buffer_flush_maybe_sync. */
+ulint	srv_async_flush			= 0;

 /*
	IMPLEMENTATION OF THE SERVER MAIN PROGRAM
@@ -2170,7 +2215,12 @@
 }

 /*************************************************************************
-The master thread controlling the server. */
+Returns the number of IO operations that is X percent of the capacity.
+
+PCT_IO(5) -> returns the number of IO operations that is 5% of the max
+where max is srv_io_capacity.
+*/
+#define PCT_IO(pct) ((ulint) (srv_io_capacity * ((double) pct / 100.0)))

 #ifndef __WIN__
 void*
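A worked expansion of the macro, with illustrative numbers only: with innodb_io_capacity set to 1000 the batch sizes used below scale up, while the default capacity of 100 reproduces the old hard-coded constants.

	/* Illustrative expansion of PCT_IO(), assuming srv_io_capacity == 1000: */
	ut_a(PCT_IO(5) == 50);		/* insert buffer merge batch (was a fixed 5) */
	ut_a(PCT_IO(100) == 1000);	/* buffer pool flush batch (was a fixed 100) */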
@@ -2199,11 +2249,15 @@
 	ulint		n_pend_ios;
 	ibool		skip_sleep	= FALSE;
 	ulint		i;
+
 #ifdef UNIV_DEBUG_THREAD_CREATION
 	fprintf(stderr, "Master thread starts, id %lu\n",
		os_thread_pf(os_thread_get_curr_id()));
 #endif
+	fprintf(stderr, "InnoDB: master thread running with io_capacity %lu\n",
+		srv_io_capacity);
+
 	srv_main_thread_process_no = os_proc_get_number();
 	srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());

@@ -2275,26 +2329,28 @@
 		srv_main_thread_op_info = "flushing log";

 		log_buffer_flush_to_disk();
+		srv_sync_flush++;

 		srv_main_thread_op_info = "making checkpoint";

 		log_free_check();

-		/* If there were less than 5 i/os during the
-		one second sleep, we assume that there is free
-		disk i/o capacity available, and it makes sense to
-		do an insert buffer merge. */
+		/* If i/os during the one second sleep were less than 5% of
+		capacity, we assume that there is free disk i/o capacity
+		available, and it makes sense to do an insert buffer merge. */

 		n_pend_ios = buf_get_n_pending_ios()
			+ log_sys->n_pending_writes;
 		n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
			+ buf_pool->n_pages_written;
-		if (n_pend_ios < 3 && (n_ios - n_ios_old < 5)) {
+		if (n_pend_ios < PCT_IO(3) && (n_ios - n_ios_old < PCT_IO(5))) {
 			srv_main_thread_op_info = "doing insert buffer merge";
-			ibuf_contract_for_n_pages(TRUE, 5);
+			ibuf_contract_for_n_pages(TRUE, PCT_IO(5));

 			srv_main_thread_op_info = "flushing log";

-			log_buffer_flush_to_disk();
+			/* No fsync when srv_flush_log_at_trx_commit != 1 */
+			log_buffer_flush_maybe_sync();
+			srv_async_flush++;
 		}

 		if (buf_get_modified_ratio_pct() >
@@ -2303,7 +2359,8 @@
 			/* Try to keep the number of modified pages in the
			buffer pool under the limit wished by the user */

-			n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
-							  ut_dulint_max);
+			n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
+							  PCT_IO(100),
+							  ut_dulint_max);

			/* If we had to do the flush, it may have taken
@@ -2325,36 +2382,47 @@
		/* ---- We perform the following code approximately once per
		10 seconds when there is database activity */

+		srv_main_10_second_loops++;
 #ifdef MEM_PERIODIC_CHECK
		/* Check magic numbers of every allocated mem block once in 10
		seconds */
		mem_validate_all_blocks();
 #endif
-		/* If there were less than 200 i/os during the 10 second period,
-		we assume that there is free disk i/o capacity available, and it
-		makes sense to flush 100 pages. */
+		/* If i/os during the 10 second period were less than 200% of
+		capacity, we assume that there is free disk i/o capacity
+		available, and it makes sense to flush srv_io_capacity pages.
+
+		Note that this is done regardless of the fraction of dirty
+		pages relative to the max requested by the user. The one second
+		loop above requests writes for that case. The writes done here
+		are not required, and may be disabled. */

		n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes;
		n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
			+ buf_pool->n_pages_written;
-		if (n_pend_ios < 3 && (n_ios - n_ios_very_old < 200)) {
+		if (srv_extra_dirty_writes
+		    && n_pend_ios < PCT_IO(3)
+		    && (n_ios - n_ios_very_old < PCT_IO(200))) {

			srv_main_thread_op_info = "flushing buffer pool pages";
-			buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
+			buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+					ut_dulint_max);

			srv_main_thread_op_info = "flushing log";
-			log_buffer_flush_to_disk();
+			/* No fsync when srv_flush_log_at_trx_commit != 1 */
+			log_buffer_flush_maybe_sync();
+			srv_async_flush++;
		}

		/* We run a batch of insert buffer merge every 10 seconds,
		even if the server were active */

		srv_main_thread_op_info = "doing insert buffer merge";
-		ibuf_contract_for_n_pages(TRUE, 5);
+		ibuf_contract_for_n_pages(TRUE, PCT_IO(5));

		srv_main_thread_op_info = "flushing log";

-		log_buffer_flush_to_disk();
+		/* No fsync when srv_flush_log_at_trx_commit != 1 */
+		log_buffer_flush_maybe_sync();
+		srv_async_flush++;

		/* We run a full purge every 10 seconds, even if the server
		were active */
@@ -2378,8 +2446,9 @@
		if (difftime(current_time, last_flush_time) > 1) {
			srv_main_thread_op_info = "flushing log";

			log_buffer_flush_to_disk();
			last_flush_time = current_time;
+			srv_sync_flush++;
		}
	}
@@ -2393,14 +2462,14 @@
		(> 70 %), we assume we can afford reserving the disk(s) for
		the time it requires to flush 100 pages */

-		n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
-						  ut_dulint_max);
+		n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+						  ut_dulint_max);
	} else {
		/* Otherwise, we only flush a small number of pages so that
		we do not unnecessarily use much disk i/o capacity from
		other work */

-		n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 10,
-						  ut_dulint_max);
+		n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10),
+						  ut_dulint_max);
	}
@@ -2434,7 +2503,7 @@
	/* The server has been quiet for a while: start running background
	operations */
-
+	srv_main_background_loops++;
	srv_main_thread_op_info = "doing background drop tables";

	n_tables_to_drop = row_drop_tables_for_mysql_in_background();
@@ -2472,6 +2541,7 @@
			log_buffer_flush_to_disk();
			last_flush_time = current_time;
+			srv_sync_flush++;
		}
	}
@@ -2487,9 +2557,13 @@
	srv_main_thread_op_info = "doing insert buffer merge";

	if (srv_fast_shutdown && srv_shutdown_state > 0) {
		n_bytes_merged = 0;
	} else {
-		n_bytes_merged = ibuf_contract_for_n_pages(TRUE, 20);
+		/* This should do an amount of IO similar to the number of
+		dirty pages that will be flushed in the call to
+		buf_flush_batch below. Otherwise, the system favors
+		clean pages over cleanup throughput. */
+		n_bytes_merged = ibuf_contract_for_n_pages(TRUE, PCT_IO(100));
	}

	srv_main_thread_op_info = "reserving kernel mutex";
@@ -2503,10 +2577,11 @@

flush_loop:
	srv_main_thread_op_info = "flushing buffer pool pages";
+	srv_main_flush_loops++;

	if (srv_fast_shutdown < 2) {
		n_pages_flushed =
-			buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
+			buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+					ut_dulint_max);
	} else {
		/* In the fastest shutdown we do not flush the buffer pool
		to data files: we set n_pages_flushed to 0 artificially. */
@@ -2528,7 +2603,17 @@

	srv_main_thread_op_info = "flushing log";

-	log_buffer_flush_to_disk();
+	current_time = time(NULL);
+	if (difftime(current_time, last_flush_time) > 1) {
+		srv_main_thread_op_info = (char*) "flushing log";
+		log_buffer_flush_to_disk();
+		last_flush_time = current_time;
+		srv_sync_flush++;
+	} else {
+		/* No fsync when srv_flush_log_at_trx_commit != 1 */
+		log_buffer_flush_maybe_sync();
+		srv_async_flush++;
+	}

	srv_main_thread_op_info = "making checkpoint";
diff -r 322370200e6a innobase/srv/srv0start.c
--- a/innobase/srv/srv0start.c	Mon Nov 03 05:07:57 2008 -0800
+++ b/innobase/srv/srv0start.c	Mon Nov 03 05:08:52 2008 -0800
@@ -973,6 +973,7 @@
	ulint	i;
	ibool	srv_file_per_table_original_value  = srv_file_per_table;
	mtr_t	mtr;
+	ulint	n_threads;
 #ifdef HAVE_DARWIN_THREADS
 # ifdef F_FULLFSYNC
	/* This executable has been compiled on Mac OS X 10.3 or later.
@@ -1206,24 +1207,32 @@
	}

	/* Restrict the maximum number of file i/o threads */
-	if (srv_n_file_io_threads > SRV_MAX_N_IO_THREADS) {
-
-		srv_n_file_io_threads = SRV_MAX_N_IO_THREADS;
+	if ((srv_n_read_io_threads + srv_n_write_io_threads)
+	    > SRV_MAX_N_IO_THREADS) {
+		fprintf(stderr,
+			"InnoDB: requested too many read(%lu) or write(%lu)"
+			" IO threads, max is %d\n",
+			(ulong) srv_n_read_io_threads,
+			(ulong) srv_n_write_io_threads,
+			SRV_MAX_N_IO_THREADS);
+		return(DB_ERROR);
	}

	if (!os_aio_use_native_aio) {
-		/* In simulated aio we currently have use only for 4 threads */
-		srv_n_file_io_threads = 4;
-
-		os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
-			    * srv_n_file_io_threads,
-			    srv_n_file_io_threads,
-			    SRV_MAX_N_PENDING_SYNC_IOS);
-	} else {
-		os_aio_init(SRV_N_PENDING_IOS_PER_THREAD
-			    * srv_n_file_io_threads,
-			    srv_n_file_io_threads,
-			    SRV_MAX_N_PENDING_SYNC_IOS);
+		/* More than 4 threads are now supported. */
+		n_threads = os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD,
+					srv_n_read_io_threads,
+					srv_n_write_io_threads,
+					SRV_MAX_N_PENDING_SYNC_IOS);
+	} else {
+		/* Might need more slots here. Alas, I don't do windows. */
+		n_threads = os_aio_init(SRV_N_PENDING_IOS_PER_THREAD,
+					srv_n_read_io_threads,
+					srv_n_write_io_threads,
+					SRV_MAX_N_PENDING_SYNC_IOS);
+	}
+
+	if (n_threads > SRV_MAX_N_IO_THREADS) {
+		fprintf(stderr,
+			"InnoDB: requested too many IO threads(%lu),"
+			" max is %d\n",
+			(ulong) n_threads, SRV_MAX_N_IO_THREADS);
+		return(DB_ERROR);
	}

	fil_init(srv_max_n_open_files);
@@ -1259,11 +1268,11 @@

	/* Create i/o-handler threads: */

-	for (i = 0; i < srv_n_file_io_threads; i++) {
+	for (i = 0; i < n_threads; i++) {
		n[i] = i;

		os_thread_create(io_handler_thread, n + i, thread_ids + i);
	}

 #ifdef UNIV_LOG_ARCHIVE
	if (0 != ut_strcmp(srv_log_group_home_dirs[0], srv_arch_dir)) {
diff -r 322370200e6a patch_info/innodb_io_tune.info
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/patch_info/innodb_io_tune.info	Mon Nov 03 05:08:52 2008 -0800
@@ -0,0 +1,9 @@
+File=innodb_io_tune.patch
+Name=Tune InnoDB IO settings
+Version=1.0
+Author=Google
+License=GPL
+Comment=
+ChangeLog=
+2008-11-01
+VT: Initial porting
diff -r 322370200e6a sql/ha_innodb.cc
--- a/sql/ha_innodb.cc	Mon Nov 03 05:07:57 2008 -0800
+++ b/sql/ha_innodb.cc	Mon Nov 03 05:08:52 2008 -0800
@@ -147,7 +147,7 @@
	innobase_additional_mem_pool_size, innobase_file_io_threads,
	innobase_lock_wait_timeout, innobase_force_recovery,
	innobase_open_files;
-
+long innobase_read_io_threads, innobase_write_io_threads;
 longlong innobase_buffer_pool_size, innobase_log_file_size;

 /* The default values for the following char* start-up parameters
@@ -175,6 +175,23 @@
 my_bool innobase_rollback_on_timeout		= FALSE;
 my_bool innobase_create_status_file		= FALSE;
 my_bool innobase_adaptive_hash_index		= TRUE;
+
+/* Max number of IO requests merged to perform large IO in background
+IO threads. */
+long innobase_max_merged_io = 64;
+
+/* Minimum time interval in seconds between calls to the innodb_show_status
+functions. */
+long innobase_min_status_update_time_interval = 30;
+
+/* Default number of IO operations per second the server can do. Tunes the
+background IO rate. */
+long innobase_io_capacity = 100;
+
+/* Write dirty pages when pct dirty is less than max pct dirty */
+my_bool innobase_extra_dirty_writes = TRUE;

 static char *internal_innobase_data_file_path	= NULL;
@@ -1372,7 +1389,11 @@

	srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;

+	srv_io_capacity = (ulint) innobase_io_capacity;
+	srv_extra_dirty_writes = (ibool) innobase_extra_dirty_writes;
	srv_n_file_io_threads = (ulint) innobase_file_io_threads;
+	srv_n_read_io_threads = (ulint) innobase_read_io_threads;
+	srv_n_write_io_threads = (ulint) innobase_write_io_threads;

	srv_lock_wait_timeout = (ulint) innobase_lock_wait_timeout;
	srv_force_recovery = (ulint) innobase_force_recovery;
diff -r 322370200e6a sql/ha_innodb.h
--- a/sql/ha_innodb.h	Mon Nov 03 05:07:57 2008 -0800
+++ b/sql/ha_innodb.h	Mon Nov 03 05:08:52 2008 -0800
@@ -197,6 +197,7 @@
 extern struct show_var_st innodb_status_variables[];
 extern ulong innobase_fast_shutdown;
+extern long innobase_max_merged_io;
 extern ulong innobase_large_page_size;
 extern long innobase_mirrored_log_groups, innobase_log_files_in_group;
 extern longlong innobase_buffer_pool_size, innobase_log_file_size;
@@ -205,10 +206,14 @@
 extern long innobase_buffer_pool_awe_mem_mb;
 extern long innobase_file_io_threads, innobase_lock_wait_timeout;
 extern long innobase_force_recovery;
+extern long innobase_read_io_threads, innobase_write_io_threads;
 extern long innobase_open_files;
 extern char *innobase_data_home_dir, *innobase_data_file_path;
 extern char *innobase_log_group_home_dir, *innobase_log_arch_dir;
 extern char *innobase_unix_file_flush_method;
+extern long innobase_io_capacity;
+extern my_bool innobase_extra_dirty_writes;
+
 /* The following variables have to be my_bool for SHOW VARIABLES to work */
 extern my_bool innobase_log_archive,
	innobase_use_doublewrite,
diff -r 322370200e6a sql/mysqld.cc
--- a/sql/mysqld.cc	Mon Nov 03 05:07:57 2008 -0800
+++ b/sql/mysqld.cc	Mon Nov 03 05:08:52 2008 -0800
@@ -4932,6 +4932,11 @@
  OPT_INNODB_ADDITIONAL_MEM_POOL_SIZE,
  OPT_INNODB_MAX_PURGE_LAG,
  OPT_INNODB_FILE_IO_THREADS,
+  OPT_INNODB_READ_IO_THREADS,
+  OPT_INNODB_WRITE_IO_THREADS,
+  OPT_INNODB_MAX_MERGED_IO,
+  OPT_INNODB_IO_CAPACITY,
+  OPT_INNODB_EXTRA_DIRTY_WRITES,
  OPT_INNODB_LOCK_WAIT_TIMEOUT,
  OPT_INNODB_THREAD_CONCURRENCY,
  OPT_INNODB_COMMIT_CONCURRENCY,
@@ -5302,6 +5307,25 @@
   (gptr*) &global_system_variables.innodb_table_locks,
   (gptr*) &global_system_variables.innodb_table_locks,
   0, GET_BOOL, OPT_ARG, 1, 0, 0, 0, 0, 0},
+  {"innodb_max_merged_io", OPT_INNODB_MAX_MERGED_IO,
+   "Max number of IO requests merged to issue large IO from background IO threads.",
+   (gptr*) &innobase_max_merged_io, (gptr*) &innobase_max_merged_io,
+   0, GET_LONG, REQUIRED_ARG, 64, 1, 64, 0, 0, 0},
+  {"innodb_read_io_threads", OPT_INNODB_READ_IO_THREADS,
+   "Number of background read I/O threads in InnoDB.",
+   (gptr*) &innobase_read_io_threads, (gptr*) &innobase_read_io_threads,
+   0, GET_LONG, REQUIRED_ARG, 1, 1, 64, 0, 1, 0},
+  {"innodb_write_io_threads", OPT_INNODB_WRITE_IO_THREADS,
+   "Number of background write I/O threads in InnoDB.",
+   (gptr*) &innobase_write_io_threads, (gptr*) &innobase_write_io_threads,
+   0, GET_LONG, REQUIRED_ARG, 1, 1, 64, 0, 1, 0},
+  {"innodb_io_capacity", OPT_INNODB_IO_CAPACITY,
+   "Number of IO operations per second the server can do. Tunes background IO rate.",
+   (gptr*) &innobase_io_capacity, (gptr*) &innobase_io_capacity,
+   0, GET_LONG, REQUIRED_ARG, 100, 100, 999999999, 0, 1, 0},
+  {"innodb_extra_dirty_writes", OPT_INNODB_EXTRA_DIRTY_WRITES,
+   "When set, flush dirty buffer pages when dirty pct is less than max dirty pct.",
+   (gptr*) &innobase_extra_dirty_writes, (gptr*) &innobase_extra_dirty_writes,
+   0, GET_BOOL, NO_ARG, 1, 0, 1, 0, 1, 0},
 #endif /* End HAVE_INNOBASE_DB */
  {"isam", OPT_ISAM, "Obsolete. ISAM storage engine is no longer supported.",
  (gptr*) &opt_isam, (gptr*) &opt_isam, 0, GET_BOOL, NO_ARG, 0, 0, 0,
diff -r 322370200e6a sql/set_var.cc
--- a/sql/set_var.cc	Mon Nov 03 05:07:57 2008 -0800
+++ b/sql/set_var.cc	Mon Nov 03 05:08:52 2008 -0800
@@ -919,12 +919,14 @@
  {"innodb_data_home_dir", (char*) &innobase_data_home_dir, SHOW_CHAR_PTR},
  {"innodb_adaptive_hash_index", (char*) &innobase_adaptive_hash_index, SHOW_MY_BOOL},
  {"innodb_doublewrite", (char*) &innobase_use_doublewrite, SHOW_MY_BOOL},
+  {"innodb_extra_dirty_writes", (char*) &innobase_extra_dirty_writes, SHOW_MY_BOOL},
  {sys_innodb_fast_shutdown.name,(char*) &sys_innodb_fast_shutdown, SHOW_SYS},
  {"innodb_file_io_threads", (char*) &innobase_file_io_threads, SHOW_LONG },
  {"innodb_file_per_table", (char*) &innobase_file_per_table, SHOW_MY_BOOL},
  {sys_innodb_flush_log_at_trx_commit.name, (char*) &sys_innodb_flush_log_at_trx_commit, SHOW_SYS},
  {"innodb_flush_method", (char*) &innobase_unix_file_flush_method, SHOW_CHAR_PTR},
  {"innodb_force_recovery", (char*) &innobase_force_recovery, SHOW_LONG },
+  {"innodb_io_capacity", (char*) &innobase_io_capacity, SHOW_LONG },
  {"innodb_lock_wait_timeout", (char*) &innobase_lock_wait_timeout, SHOW_LONG },
  {"innodb_locks_unsafe_for_binlog", (char*) &innobase_locks_unsafe_for_binlog, SHOW_MY_BOOL},
  {"innodb_log_arch_dir", (char*) &innobase_log_arch_dir, SHOW_CHAR_PTR},
@@ -943,6 +945,9 @@
  {sys_innodb_table_locks.name, (char*) &sys_innodb_table_locks, SHOW_SYS},
  {sys_innodb_thread_concurrency.name, (char*) &sys_innodb_thread_concurrency, SHOW_SYS},
  {sys_innodb_thread_sleep_delay.name, (char*) &sys_innodb_thread_sleep_delay, SHOW_SYS},
+  {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG },
+  {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG },
+  {"innodb_max_merged_io", (char*) &innobase_max_merged_io, SHOW_LONG},
 #endif
  {sys_interactive_timeout.name,(char*) &sys_interactive_timeout, SHOW_SYS},
  {sys_join_buffer_size.name, (char*) &sys_join_buffer_size, SHOW_SYS},
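Taken together, a server on a RAID array capable of roughly 1000 IOPS might be configured as follows. The values are illustrative assumptions only; the option names are the ones added by this patch:

[mysqld]
innodb_read_io_threads = 4
innodb_write_io_threads = 4
innodb_io_capacity = 1000
innodb_max_merged_io = 32
innodb_extra_dirty_writes = 1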