diff -r 322370200e6a innobase/include/os0file.h
--- a/innobase/include/os0file.h	Mon Nov 03 05:07:57 2008 -0800
+++ b/innobase/include/os0file.h	Mon Nov 03 05:08:52 2008 -0800
@@ -532,21 +532,16 @@
 					FALSE otherwise */
 	const char*	path);	/* in: path name */
 /****************************************************************************
-Initializes the asynchronous io system. Creates separate aio array for
-non-ibuf read and write, a third aio array for the ibuf i/o, with just one
-segment, two aio arrays for log reads and writes with one segment, and a
-synchronous aio array of the specified size. The combined number of segments
-in the three first aio arrays is the parameter n_segments given to the
-function. The caller must create an i/o handler thread for each segment in
-the four first arrays, but not for the sync aio array. */
+Initializes the asynchronous io system. */

-void
+ulint
 os_aio_init(
 /*========*/
-	ulint	n,		/* in: maximum number of pending aio operations
-				allowed; n must be divisible by n_segments */
-	ulint	n_segments,	/* in: combined number of segments in the four
-				first aio arrays; must be >= 4 */
+				/* out: number of AIO handler threads */
+	ulint	ios_per_array,	/* in: maximum number of pending aio operations
+				allowed per IO array */
+	ulint	n_read_threads,	/* in: number of read threads */
+	ulint	n_write_threads,/* in: number of write threads */
 	ulint	n_slots_sync);	/* in: number of slots in the sync aio array */
 /***********************************************************************
 Requests an asynchronous i/o operation. */
diff -r 322370200e6a innobase/include/srv0srv.h
--- a/innobase/include/srv0srv.h	Mon Nov 03 05:07:57 2008 -0800
+++ b/innobase/include/srv0srv.h	Mon Nov 03 05:08:52 2008 -0800
@@ -87,6 +87,14 @@
 extern ulint	srv_lock_table_size;

 extern ulint	srv_n_file_io_threads;
+extern ulint	srv_n_read_io_threads;
+extern ulint	srv_n_write_io_threads;
+
+/* Number of IO operations per second the server can do */
+extern ulint	srv_io_capacity;
+
+/* Flush dirty pages when below max dirty percent */
+extern ibool	srv_extra_dirty_writes;

 #ifdef UNIV_LOG_ARCHIVE
 extern ibool	srv_log_archive_on;
@@ -252,6 +260,24 @@
 /* variable to count the number of random read-aheads were done */
 extern ulint	srv_read_ahead_rnd;

+/* Number of IO operations read/write done for all threads */
+extern ulint	os_aio_read_requests;
+extern ulint	os_aio_write_requests;
+
+/* Number of pages read/written done for all threads */
+extern ulint	os_aio_pages_read;
+extern ulint	os_aio_pages_written;
+
+/* time usec used to perform read/write for all threads */
+extern ib_longlong	os_aio_read_time;
+extern ib_longlong	os_aio_write_time;
+
+extern ulint	inno_pending_normal_aio_reads;
+extern ulint	inno_pending_normal_aio_writes;
+extern ulint	inno_pending_ibuf_aio_reads;
+extern ulint	inno_pending_log_ios;
+extern ulint	inno_pending_sync_ios;

 /* In this structure we store status variables to be passed to MySQL */
 typedef struct export_var_struct export_struc;
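The header change above inverts the old contract: instead of the caller passing a segment count, os_aio_init now derives the segment layout from the thread counts and returns the number of handler threads to start. A minimal caller sketch (illustrative only; it mirrors the srv0start.c hunk later in this patch and assumes the same SRV_* constants and the existing n/thread_ids arrays from srv0start.c):

	/* Sketch of the new calling convention, not part of the patch:
	one i/o handler thread is started per returned segment. */
	ulint	n_threads;
	ulint	i;

	n_threads = os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD,
				srv_n_read_io_threads,
				srv_n_write_io_threads,
				SRV_MAX_N_PENDING_SYNC_IOS);

	for (i = 0; i < n_threads; i++) {
		n[i] = i;
		os_thread_create(io_handler_thread, n + i, thread_ids + i);
	}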
diff -r 322370200e6a innobase/log/log0log.c
--- a/innobase/log/log0log.c	Mon Nov 03 05:07:57 2008 -0800
+++ b/innobase/log/log0log.c	Mon Nov 03 05:08:52 2008 -0800
@@ -1537,6 +1537,30 @@
 	log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE,
 				LOG_WRITE_FROM_BACKGROUND_SYNC);
+}
+
+/********************************************************************
+Flushes the log buffer. Forces it to disk depending on the value of
+innodb_flush_log_at_trx_commit. */
+
+void
+log_buffer_flush_maybe_sync(void)
+/*=============================*/
+{
+	dulint	lsn;
+
+	mutex_enter(&(log_sys->mutex));
+
+	lsn = log_sys->lsn;
+
+	mutex_exit(&(log_sys->mutex));
+
+	/* Force the log buffer to disk when innodb_flush_log_at_trx_commit = 1. */
+	log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS,
+			srv_flush_log_at_trx_commit == 1 ? TRUE : FALSE,
+			srv_flush_log_at_trx_commit == 1
+			? LOG_WRITE_FROM_BACKGROUND_SYNC
+			: LOG_WRITE_FROM_BACKGROUND_ASYNC);
 }

 /********************************************************************
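The helper above collapses to two cases; this restatement is illustrative, not part of the patch:

	if (srv_flush_log_at_trx_commit == 1) {
		/* Durability required: write the log buffer and fsync. */
		log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE,
				LOG_WRITE_FROM_BACKGROUND_SYNC);
	} else {
		/* Write the log buffer but let the OS schedule the flush. */
		log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, FALSE,
				LOG_WRITE_FROM_BACKGROUND_ASYNC);
	}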
diff -r 322370200e6a innobase/os/os0file.c
--- a/innobase/os/os0file.c	Mon Nov 03 05:07:57 2008 -0800
+++ b/innobase/os/os0file.c	Mon Nov 03 05:08:52 2008 -0800
@@ -22,6 +22,8 @@
 #include
 #endif /* UNIV_HOTBACKUP */

+extern long innobase_max_merged_io;
+
 #undef HAVE_FDATASYNC

 #ifdef POSIX_ASYNC_IO
@@ -63,6 +65,28 @@
 ibool	os_aio_use_native_aio	= FALSE;

 ibool	os_aio_print_debug	= FALSE;
+
+/* State of an IO request in simulated AIO.
+   Protocol for simulated aio:
+   client requests IO: find slot with reserved = FALSE. Add entry with
+		status = OS_AIO_NOT_ISSUED.
+   IO thread wakes: find adjacent slots with reserved = TRUE and status =
+		OS_AIO_NOT_ISSUED. Change status for slots to
+		OS_AIO_ISSUED.
+   IO operation completes: set status for slots to OS_AIO_DONE. Set status
+		for the first slot to OS_AIO_CLAIMED and return
+		result for that slot.
+   When there are multiple read and write threads, they all compete to execute
+   the requests in the array (os_aio_array_t). This avoids the need to load
+   balance requests at the time the request is made at the cost of waking all
+   threads when a request is available.
+*/
+typedef enum {
+	OS_AIO_NOT_ISSUED,	/* Available to be processed by an IO thread. */
+	OS_AIO_ISSUED,		/* Being processed by an IO thread. */
+	OS_AIO_DONE,		/* Request processed. */
+	OS_AIO_CLAIMED		/* Result being returned to client. */
+} os_aio_status;

 /* The aio array slot structure */
 typedef struct os_aio_slot_struct	os_aio_slot_t;

 struct os_aio_slot_struct{
@@ -72,6 +96,8 @@
 	ulint		pos;		/* index of the slot in the aio
 					array */
 	ibool		reserved;	/* TRUE if this slot is reserved */
+	os_aio_status	status;		/* Status of the current request. Valid
+					when reserved is TRUE. Used only in
+					simulated aio. */
 	time_t		reservation_time;/* time when reserved */
 	ulint		len;		/* length of the block to read or
 					write */
@@ -82,11 +108,6 @@
 	ulint		offset_high;	/* 32 high bits of file offset */
 	os_file_t	file;		/* file where to read or write */
 	const char*	name;		/* file name or path */
-	ibool		io_already_done;/* used only in simulated aio:
-					TRUE if the physical i/o already
-					made and only the slot message
-					needs to be passed to the caller
-					of os_aio_simulated_handle */
 	fil_node_t*	message1;	/* message which is given by the */
 	void*		message2;	/* the requester of an aio operation
 					and which can be used to identify
@@ -116,9 +137,6 @@
 					in this array */
 	ulint		n_slots;	/* Total number of slots in the aio
 					array. This must be divisible by
 					n_threads. */
-	ulint		n_segments;	/* Number of segments in the aio array of
-					pending aio requests. A thread can wait
-					separately for any one of the segments. */
 	ulint		n_reserved;	/* Number of reserved slots in the
 					aio array outside the ibuf segment */
 	os_aio_slot_t*	slots;		/* Pointer to the slots in the array */
@@ -134,6 +152,17 @@

 /* Array of events used in simulated aio */
 os_event_t*	os_aio_segment_wait_events	= NULL;
+
+/* Number of threads for reading and writing. */
+ulint	os_aio_read_threads	= 0;
+ulint	os_aio_write_threads	= 0;
+
+/* Number of the first global segment for reading. */
+const ulint	os_aio_first_read_segment = 2;
+
+/* Number of the first global segment for writing. Set to
+2 + os_aio_read_threads. */
+ulint	os_aio_first_write_segment = 0;

 /* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
 are NULL when the module has not yet been initialized. */
@@ -143,11 +172,39 @@
 static os_aio_array_t*	os_aio_log_array	= NULL;
 static os_aio_array_t*	os_aio_sync_array	= NULL;

+/* Per thread buffer used for merged IO requests. Used by
+os_aio_simulated_handle so that a buffer doesn't have to be allocated
+for each request. */
+static char*	os_aio_thread_buffer[SRV_MAX_N_IO_THREADS];
+static ulint	os_aio_thread_buffer_size[SRV_MAX_N_IO_THREADS];
+
+/* Count pages read and written per thread */
+static ulint	os_aio_thread_io_reads[SRV_MAX_N_IO_THREADS];
+static ulint	os_aio_thread_io_writes[SRV_MAX_N_IO_THREADS];
+
+/* Number of IO operations done. One request can be for N pages. */
+static ulint	os_aio_thread_io_requests[SRV_MAX_N_IO_THREADS];
+
+/* usecs spent blocked on an IO request */
+static double	os_aio_thread_io_wait[SRV_MAX_N_IO_THREADS];
+/* max usecs spent blocked on an IO request */
+static double	os_aio_thread_max_io_wait[SRV_MAX_N_IO_THREADS];
+
+/* Number of IO global segments. An IO handler thread is created for each
+global segment, except for the segment associated with os_aio_sync_array.
+Several segments can be associated with os_aio_{read,write}_array. One
+segment is created for each of the other arrays. This is also the number
+of valid entries in srv_io_thread_reads, srv_io_thread_writes,
+srv_io_thread_op_info, srv_io_thread_function and os_aio_segment_wait_events. */
 static ulint	os_aio_n_segments	= ULINT_UNDEFINED;

-/* If the following is TRUE, read i/o handler threads try to
-wait until a batch of new read requests have been posted */
-static ibool	os_aio_recommend_sleep_for_read_threads	= FALSE;
+/* Set to TRUE to temporarily block reads from being scheduled while a batch
+of read requests is added, to allow them to be merged by the IO handler thread
+if they are adjacent. Declared volatile because we don't want this to be
+read from a register in a loop when another thread may change the value in
+memory. */
+static volatile ibool	os_aio_recommend_sleep_for_read_threads	= FALSE;

 ulint	os_n_file_reads		= 0;
 ulint	os_bytes_read_since_printout = 0;
@@ -166,6 +223,19 @@
 ulint	os_file_n_pending_pwrites = 0;
 ulint	os_n_pending_writes = 0;
 ulint	os_n_pending_reads = 0;
+
+/* TODO -- does InnoDB provide a portable method for this? */
+static double time_usecs(void) {
+#ifdef __WIN__
+	return 0.0;
+#else
+	struct timeval tv;
+	if (gettimeofday(&tv, NULL))
+		return 0;
+	else
+		return tv.tv_sec * 1000000.0 + tv.tv_usec;
+#endif
+}

 /***************************************************************************
 Gets the operating system version. Currently works only on Windows. */
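On Windows the function above returns 0.0, so the per-thread wait statistics are only meaningful on POSIX builds. A hypothetical Windows branch (an assumption, not part of the patch) could answer the TODO with the performance counter API:

#ifdef __WIN__
/* Hypothetical sketch only: convert the Windows performance counter
to microseconds; assumes a stable counter frequency. */
static double time_usecs(void) {
	LARGE_INTEGER freq, now;
	if (!QueryPerformanceFrequency(&freq)
	    || !QueryPerformanceCounter(&now))
		return 0.0;
	return (double) now.QuadPart * 1000000.0 / (double) freq.QuadPart;
}
#endif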
@@ -1351,6 +1421,8 @@
 	/* We disable OS caching (O_DIRECT) only on data files */
 	if (type != OS_LOG_FILE
 	    && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
+
+		fprintf(stderr, "Using O_DIRECT for file %s\n", name);
 		os_file_set_nocache(file, name, mode_str);
 	}

@@ -1798,6 +1870,32 @@
 #endif /* __WIN__ */
 }

+#ifndef __WIN__
+/***************************************************************************
+Flushes a given file to disk with fsync, unless the flush method is
+SRV_UNIX_NOSYNC, in which case this is a no-op. */
+
+ibool
+os_maybe_fsync(
+/*===========*/
+				/* out: 0 if success, error code otherwise */
+	os_file_t	file)	/* in: handle to a file */
+{
+	return (srv_unix_file_flush_method == SRV_UNIX_NOSYNC)
+		? 0 : fsync(file);
+}
+
+/***************************************************************************
+Flushes a given file to disk with fdatasync, unless the flush method is
+SRV_UNIX_NOSYNC, in which case this is a no-op. */
+
+ibool
+os_maybe_fdatasync(
+/*===============*/
+				/* out: 0 if success, error code otherwise */
+	os_file_t	file)	/* in: handle to a file */
+{
+	return (srv_unix_file_flush_method == SRV_UNIX_NOSYNC)
+		? 0 : fdatasync(file);
+}
+#endif
+
 /***************************************************************************
 Flushes the write buffers of a given file to the disk. */

@@ -1855,21 +1953,21 @@
 		/* If we are not on an operating system that supports this,
 		then fall back to a plain fsync. */

-		ret = fsync(file);
+		ret = os_maybe_fsync(file);
 	} else {
 		ret = fcntl(file, F_FULLFSYNC, NULL);

 		if (ret) {
 			/* If we are not on a file system that supports this,
 			then fall back to a plain fsync. */
-			ret = fsync(file);
+			ret = os_maybe_fsync(file);
 		}
 	}
 #elif HAVE_FDATASYNC
-	ret = fdatasync(file);
+	ret = os_maybe_fdatasync(file);
 #else
 /*	fprintf(stderr, "Flushing to file %p\n", file); */
-	ret = fsync(file);
+	ret = os_maybe_fsync(file);
 #endif
 	os_n_fsyncs++;
@@ -2298,6 +2396,9 @@

 		return(TRUE);
 	}

+	fprintf(stderr,
+"InnoDB: error: os_file_pread wanted %lu and got %lu.\n",
+		(ulint) n, (ulint) ret);
 #endif
 #ifdef __WIN__
 error_handling:
@@ -2784,9 +2885,8 @@
 os_aio_array_create(
 /*================*/
 				/* out, own: aio array */
-	ulint	n,		/* in: maximum number of pending aio operations
-				allowed; n must be divisible by n_segments */
-	ulint	n_segments)	/* in: number of segments in the aio array */
+	ulint	n)		/* in: maximum number of pending aio operations
+				allowed */
 {
 	os_aio_array_t*	array;
 	ulint		i;
@@ -2795,7 +2895,6 @@
 	OVERLAPPED*	over;
 #endif
 	ut_a(n > 0);
-	ut_a(n_segments > 0);

 	array = ut_malloc(sizeof(os_aio_array_t));
@@ -2806,7 +2905,6 @@
 	os_event_set(array->is_empty);

 	array->n_slots		= n;
-	array->n_segments	= n_segments;
 	array->n_reserved	= 0;
 	array->slots		= ut_malloc(n * sizeof(os_aio_slot_t));
 #ifdef __WIN__
@@ -2833,70 +2931,75 @@
 /****************************************************************************
 Initializes the asynchronous io system. Calls also os_io_init_simple.
-Creates a separate aio array for
-non-ibuf read and write, a third aio array for the ibuf i/o, with just one
-segment, two aio arrays for log reads and writes with one segment, and a
-synchronous aio array of the specified size. The combined number of segments
-in the three first aio arrays is the parameter n_segments given to the
-function. The caller must create an i/o handler thread for each segment in
-the four first arrays, but not for the sync aio array. */
+Creates an aio array for each of non-ibuf read, non-ibuf write, ibuf IO,
+log IO, and synchronous IO. The caller must create an i/o handler thread for
+each array except the synchronous aio array. Multiple threads can access the
+same array for the non-ibuf read (prefetch) and write (flush dirty buffer
+pages) arrays.
+Returns the number of AIO handler threads. */

-void
+ulint
 os_aio_init(
 /*========*/
-	ulint	n,		/* in: maximum number of pending aio operations
-				allowed; n must be divisible by n_segments */
-	ulint	n_segments,	/* in: combined number of segments in the four
-				first aio arrays; must be >= 4 */
+	ulint	ios_per_array,	/* in: maximum number of pending aio operations
+				allowed per array */
+	ulint	n_read_threads,	/* in: number of read threads */
+	ulint	n_write_threads,/* in: number of write threads */
 	ulint	n_slots_sync)	/* in: number of slots in the sync aio array */
 {
-	ulint	n_read_segs;
-	ulint	n_write_segs;
-	ulint	n_per_seg;
-	ulint	i;
+	ulint	i;
+	ulint	n_segments = 2 + n_read_threads + n_write_threads;
 #ifdef POSIX_ASYNC_IO
 	sigset_t   sigset;
 #endif
-	ut_ad(n % n_segments == 0);
-	ut_ad(n_segments >= 4);
+	ut_a(ios_per_array >= OS_AIO_N_PENDING_IOS_PER_THREAD);
+	ut_a(n_read_threads >= 1 && n_read_threads <= 64);
+	ut_a(n_write_threads >= 1 && n_write_threads <= 64);
+	ut_a(n_segments < SRV_MAX_N_IO_THREADS);

 	os_io_init_simple();

 	for (i = 0; i < n_segments; i++) {
 		srv_set_io_thread_op_info(i, "not started yet");
-	}
-
-	n_per_seg = n / n_segments;
-	n_write_segs = (n_segments - 2) / 2;
-	n_read_segs = n_segments - 2 - n_write_segs;
-
-	/* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
-
-	os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
+		os_aio_thread_io_reads[i] = 0;
+		os_aio_thread_io_writes[i] = 0;
+		os_aio_thread_io_requests[i] = 0;
+		os_aio_thread_buffer[i] = 0;
+		os_aio_thread_buffer_size[i] = 0;
+		os_aio_thread_io_wait[i] = 0;
+		os_aio_thread_max_io_wait[i] = 0;
+	}
+
+	os_aio_read_threads = n_read_threads;
+	os_aio_write_threads = n_write_threads;
+	os_aio_first_write_segment = os_aio_first_read_segment
+					+ os_aio_read_threads;
+
+	fprintf(stderr,
+	"InnoDB: ios_per_array %lu read threads %lu write threads %lu\n",
+		ios_per_array, os_aio_read_threads, os_aio_write_threads);
+
+	os_aio_ibuf_array = os_aio_array_create(ios_per_array);

 	srv_io_thread_function[0] = "insert buffer thread";

-	os_aio_log_array = os_aio_array_create(n_per_seg, 1);
+	os_aio_log_array = os_aio_array_create(ios_per_array);

 	srv_io_thread_function[1] = "log thread";

-	os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
-						n_read_segs);
-	for (i = 2; i < 2 + n_read_segs; i++) {
+	os_aio_read_array = os_aio_array_create(ios_per_array);
+	for (i = os_aio_first_read_segment;
+	     i < os_aio_first_write_segment; i++) {
 		ut_a(i < SRV_MAX_N_IO_THREADS);
 		srv_io_thread_function[i] = "read thread";
 	}

-	os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
-						n_write_segs);
-	for (i = 2 + n_read_segs; i < n_segments; i++) {
+	os_aio_write_array = os_aio_array_create(ios_per_array);
+	for (i = os_aio_first_write_segment; i < n_segments; i++) {
 		ut_a(i < SRV_MAX_N_IO_THREADS);
 		srv_io_thread_function[i] = "write thread";
 	}

-	os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
-
-	os_aio_n_segments = n_segments;
+	os_aio_sync_array = os_aio_array_create(n_slots_sync);
+
+	os_aio_n_segments = 2 + os_aio_read_threads + os_aio_write_threads;

 	os_aio_validate();
@@ -2924,6 +3027,7 @@
 	pthread_sigmask(SIG_BLOCK, &sigset, NULL); */
 #endif
+	return os_aio_n_segments;
 }

 #ifdef WIN_ASYNC_IO
@@ -2981,77 +3085,32 @@
 	os_event_wait(os_aio_write_array->is_empty);
 }

-/**************************************************************************
-Calculates segment number for a slot. */
-static
-ulint
-os_aio_get_segment_no_from_slot(
-/*============================*/
-				/* out: segment number (which is the number
-				used by, for example, i/o-handler threads) */
-	os_aio_array_t*	array,	/* in: aio wait array */
-	os_aio_slot_t*	slot)	/* in: slot in this array */
-{
-	ulint	segment;
-	ulint	seg_len;
-
-	if (array == os_aio_ibuf_array) {
-		segment = 0;
-
-	} else if (array == os_aio_log_array) {
-		segment = 1;
-
-	} else if (array == os_aio_read_array) {
-		seg_len = os_aio_read_array->n_slots /
-				os_aio_read_array->n_segments;
-
-		segment = 2 + slot->pos / seg_len;
-	} else {
-		ut_a(array == os_aio_write_array);
-		seg_len = os_aio_write_array->n_slots /
-				os_aio_write_array->n_segments;
-
-		segment = os_aio_read_array->n_segments + 2
-			+ slot->pos / seg_len;
-	}
-
-	return(segment);
-}
-
-/**************************************************************************
-Calculates local segment number and aio array from global segment number. */
-static
-ulint
-os_aio_get_array_and_local_segment(
-/*===============================*/
+/**************************************************************************
+Calculates the aio array from the global segment number. */
+static
+os_aio_array_t*
+os_aio_get_array(
+/*=============*/
-				/* out: local segment number within
-				the aio array */
-	os_aio_array_t** array,	/* out: aio wait array */
+				/* out: aio wait array */
 	ulint	 global_segment)/* in: global segment number */
 {
-	ulint	segment;
 	ut_a(global_segment < os_aio_n_segments);

 	if (global_segment == 0) {
-		*array = os_aio_ibuf_array;
-		segment = 0;
+		return os_aio_ibuf_array;

 	} else if (global_segment == 1) {
-		*array = os_aio_log_array;
-		segment = 0;
-
-	} else if (global_segment < os_aio_read_array->n_segments + 2) {
-		*array = os_aio_read_array;
-
-		segment = global_segment - 2;
-	} else {
-		*array = os_aio_write_array;
-
-		segment = global_segment - (os_aio_read_array->n_segments + 2);
-	}
-
-	return(segment);
+		return os_aio_log_array;
+
+	} else if (global_segment < os_aio_first_write_segment) {
+		return os_aio_read_array;
+
+	} else {
+		return os_aio_write_array;
+
+	}
 }

 /***********************************************************************
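For reference, the global segment numbering implied by os_aio_get_array is: segment 0 is the ibuf array, segment 1 the log array, segments [2, os_aio_first_write_segment) share the read array, and the remaining segments share the write array. A small illustrative helper (the function name is hypothetical, not part of the patch):

/* Illustrative only: name the kind of a global segment under the new layout. */
static const char*
os_aio_segment_kind(ulint global_segment)
{
	ut_a(global_segment < os_aio_n_segments);

	if (global_segment == 0) return("ibuf");
	if (global_segment == 1) return("log");
	if (global_segment < os_aio_first_write_segment) return("read");
	return("write");
}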
@@ -3160,7 +3219,7 @@
 		os_aio_simulated_wake_handler_threads();
 	}
-
+
 	os_event_wait(array->not_full);

 	goto loop;
@@ -3173,7 +3232,7 @@
 			break;
 		}
 	}
-
+
 	ut_a(i < array->n_slots);

 	array->n_reserved++;

 	if (array->n_reserved == 1) {
@@ -3195,7 +3254,7 @@
 	slot->buf      = buf;
 	slot->offset   = offset;
 	slot->offset_high = offset_high;
-	slot->io_already_done = FALSE;
+	slot->status = OS_AIO_NOT_ISSUED;

 #ifdef WIN_ASYNC_IO
 	control = &(slot->control);
@@ -3246,8 +3305,9 @@
 	os_mutex_enter(array->mutex);

 	ut_ad(slot->reserved);
-
+
 	slot->reserved = FALSE;
+	slot->status = OS_AIO_NOT_ISSUED;

 	array->n_reserved--;
@@ -3266,36 +3326,40 @@
 }

 /**************************************************************************
-Wakes up a simulated aio i/o-handler thread if it has something to do. */
+Wakes up the simulated aio i/o-handler threads for a given array if there
+is work to do. */
 static
 void
 os_aio_simulated_wake_handler_thread(
 /*=================================*/
-	ulint	global_segment)	/* in: the number of the segment in the aio
-				arrays */
-{
-	os_aio_array_t*	array;
-	os_aio_slot_t*	slot;
-	ulint		segment;
+	os_aio_array_t*	array)	/* in: aio array for which wakeup is done */
+{
+	os_aio_slot_t*	slot;
 	ulint		n;
 	ulint		i;

 	ut_ad(!os_aio_use_native_aio);

-	segment = os_aio_get_array_and_local_segment(&array, global_segment);
-
-	n = array->n_slots / array->n_segments;
-
-	/* Look through n slots after the segment * n'th slot */
-
-	os_mutex_enter(array->mutex);
-
-	for (i = 0; i < n; i++) {
-		slot = os_aio_array_get_nth_slot(array, i + segment * n);
-
-		if (slot->reserved) {
-			/* Found an i/o request */
-
+	n = array->n_slots;
+
+	/* Look through n slots */
+
+	os_mutex_enter(array->mutex);
+
+	for (i = 0; i < n; i++) {
+		slot = os_aio_array_get_nth_slot(array, i);
+
+		if (slot->reserved &&
+		    (slot->status == OS_AIO_NOT_ISSUED ||
+		     slot->status == OS_AIO_DONE)) {
+			/* Found an i/o request. OS_AIO_NOT_ISSUED means the
+			read or write request has yet to be done. OS_AIO_DONE
+			means the request has been done but it was part of a
+			set of requests merged into one read or write call
+			and was not the first block in the request, so the
+			handling of the IO completion for that block has not
+			been done. */
+
 			break;
 		}
 	}
@@ -3303,7 +3367,25 @@
 	os_mutex_exit(array->mutex);

 	if (i < n) {
-		os_event_set(os_aio_segment_wait_events[global_segment]);
+		if (array == os_aio_ibuf_array) {
+			os_event_set(os_aio_segment_wait_events[0]);
+
+		} else if (array == os_aio_log_array) {
+			os_event_set(os_aio_segment_wait_events[1]);
+
+		} else if (array == os_aio_read_array) {
+			ulint x;
+			for (x = os_aio_first_read_segment;
+			     x < os_aio_first_write_segment; x++)
+				os_event_set(os_aio_segment_wait_events[x]);
+
+		} else if (array == os_aio_write_array) {
+			ulint x;
+			for (x = os_aio_first_write_segment;
+			     x < os_aio_n_segments; x++)
+				os_event_set(os_aio_segment_wait_events[x]);
+
+		} else {
+			ut_a(0);
+		}
 	}
 }

@@ -3320,13 +3402,14 @@

 		/* We do not use simulated aio: do nothing */

 		return;
-	}
-
-	os_aio_recommend_sleep_for_read_threads	= FALSE;
-
-	for (i = 0; i < os_aio_n_segments; i++) {
-		os_aio_simulated_wake_handler_thread(i);
-	}
+	}
+
+	os_aio_recommend_sleep_for_read_threads	= FALSE;
+
+	os_aio_simulated_wake_handler_thread(os_aio_ibuf_array);
+	os_aio_simulated_wake_handler_thread(os_aio_log_array);
+	os_aio_simulated_wake_handler_thread(os_aio_read_array);
+	os_aio_simulated_wake_handler_thread(os_aio_write_array);
 }

 /**************************************************************************
@@ -3339,18 +3422,13 @@
 os_aio_simulated_put_read_threads_to_sleep(void)
 /*============================================*/
 {
-	os_aio_array_t*	array;
 	ulint		g;

+	/* TODO(mcallaghan): provide a similar function for writes? */
 	os_aio_recommend_sleep_for_read_threads	= TRUE;

-	for (g = 0; g < os_aio_n_segments; g++) {
-		os_aio_get_array_and_local_segment(&array, g);
-
-		if (array == os_aio_read_array) {
-
-			os_event_reset(os_aio_segment_wait_events[g]);
-		}
+	for (g = os_aio_first_read_segment;
+	     g < os_aio_first_write_segment; g++) {
+		os_event_reset(os_aio_segment_wait_events[g]);
 	}
 }

@@ -3480,8 +3558,7 @@
 #endif
 		} else {
 			if (!wake_later) {
-				os_aio_simulated_wake_handler_thread(
-				os_aio_get_segment_no_from_slot(array, slot));
+				os_aio_simulated_wake_handler_thread(array);
 			}
 		}
 	} else if (type == OS_FILE_WRITE) {
@@ -3497,8 +3574,7 @@
 #endif
 		} else {
 			if (!wake_later) {
-				os_aio_simulated_wake_handler_thread(
-				os_aio_get_segment_no_from_slot(array, slot));
+				os_aio_simulated_wake_handler_thread(array);
 			}
 		}
 	} else {
@@ -3561,7 +3637,7 @@
 os_aio_windows_handle(
 /*==================*/
 				/* out: TRUE if the aio operation succeeded */
-	ulint	segment,	/* in: the number of the segment in the aio
+	ulint	global_segment,	/* in: the number of the segment in the aio
 				arrays to wait for; segment 0 is the ibuf
 				i/o thread, segment 1 the log i/o thread,
 				then follow the non-ibuf read threads, and as
@@ -3579,7 +3655,6 @@
 	void**	message2,
 	ulint*	type)		/* out: OS_FILE_WRITE or ..._READ */
 {
-	ulint		orig_seg	= segment;
 	os_aio_array_t*	array;
 	os_aio_slot_t*	slot;
 	ulint		n;
@@ -3588,33 +3663,30 @@
 	BOOL		ret;
 	DWORD		len;

-	if (segment == ULINT_UNDEFINED) {
+	if (global_segment == ULINT_UNDEFINED) {
 		array = os_aio_sync_array;
-		segment = 0;
-	} else {
-		segment = os_aio_get_array_and_local_segment(&array, segment);
+	} else {
+		array = os_aio_get_array(global_segment);
 	}

 	/* NOTE! We only access constant fields in os_aio_array. Therefore
 	we do not have to acquire the protecting mutex yet */

 	ut_ad(os_aio_validate());
-	ut_ad(segment < array->n_segments);
-
-	n = array->n_slots / array->n_segments;
+
+	n = array->n_slots;

 	if (array == os_aio_sync_array) {
 		os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
 		i = pos;
 	} else {
-		srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
-		i = os_event_wait_multiple(n,
-				(array->native_events) + segment * n);
-	}
-
-	os_mutex_enter(array->mutex);
-
-	slot = os_aio_array_get_nth_slot(array, i + segment * n);
+		srv_set_io_thread_op_info(global_segment, "wait Windows aio");
+		i = os_event_wait_multiple(n, (array->native_events));
+	}
+
+	os_mutex_enter(array->mutex);
+
+	slot = os_aio_array_get_nth_slot(array, i);

 	ut_a(slot->reserved);
@@ -3787,14 +3859,16 @@
 	ulint*	type)		/* out: OS_FILE_WRITE or ..._READ */
 {
 	os_aio_array_t*	array;
-	ulint		segment;
 	os_aio_slot_t*	slot;
 	os_aio_slot_t*	slot2;
 	os_aio_slot_t*	consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
+	os_aio_slot_t*	lowest_request;
+	os_aio_slot_t*	oldest_request;
 	ulint		n_consecutive;
 	ulint		total_len;
 	ulint		offs;
 	ulint		lowest_offset;
+	ulint		oldest_offset;
 	ulint		biggest_age;
 	ulint		age;
 	byte*		combined_buf;
@@ -3802,8 +3876,10 @@
 	ibool		ret;
 	ulint		n;
 	ulint		i;
-
-	segment = os_aio_get_array_and_local_segment(&array, global_segment);
+	double		start_usecs, stop_usecs, elapsed_usecs;
+	time_t		now;
+
+	array = os_aio_get_array(global_segment);

restart:
 	/* NOTE! We only access constant fields in os_aio_array. Therefore
@@ -3812,11 +3888,10 @@
 	we do not have to acquire the protecting mutex yet */

 	srv_set_io_thread_op_info(global_segment,
				"looking for i/o requests (a)");
 	ut_ad(os_aio_validate());
-	ut_ad(segment < array->n_segments);
-
-	n = array->n_slots / array->n_segments;
-
-	/* Look through n slots after the segment * n'th slot */
+
+	n = array->n_slots;
+
+	/* Look through n slots */

 	if (array == os_aio_read_array
 	    && os_aio_recommend_sleep_for_read_threads) {
@@ -3836,9 +3911,9 @@
 	done */

 	for (i = 0; i < n; i++) {
-		slot = os_aio_array_get_nth_slot(array, i + segment * n);
-
-		if (slot->reserved && slot->io_already_done) {
+		slot = os_aio_array_get_nth_slot(array, i);
+
+		if (slot->reserved && slot->status == OS_AIO_DONE) {

 			if (os_aio_print_debug) {
 				fprintf(stderr,
@@ -3846,79 +3921,66 @@
 			}

 			ret = TRUE;
-
+
 			goto slot_io_done;
 		}
 	}

-	n_consecutive = 0;
-
-	/* If there are at least 2 seconds old requests, then pick the oldest
-	one to prevent starvation. If several requests have the same age,
-	then pick the one at the lowest offset. */
-
 	biggest_age = 0;
-	lowest_offset = ULINT_MAX;
-
-	for (i = 0; i < n; i++) {
-		slot = os_aio_array_get_nth_slot(array, i + segment * n);
-
-		if (slot->reserved) {
-			age = (ulint)difftime(time(NULL),
-						slot->reservation_time);
-
+	now = time(NULL);
+	oldest_request = lowest_request = NULL;
+	oldest_offset = lowest_offset = ULINT_MAX;
+
+	/* Find the oldest request and the request with the smallest offset */
+	for (i = 0; i < n; i++) {
+		slot = os_aio_array_get_nth_slot(array, i);
+
+		if (slot->reserved && slot->status == OS_AIO_NOT_ISSUED) {
+			age = (ulint)difftime(now, slot->reservation_time);
+
+			/* If there are requests at least 2 seconds old, then
+			pick the oldest one to prevent starvation. If several
+			requests have the same age, then pick the one at the
+			lowest offset. */
 			if ((age >= 2 && age > biggest_age)
 			    || (age >= 2 && age == biggest_age
-				&& slot->offset < lowest_offset)) {
+				&& slot->offset < oldest_offset)) {

 				/* Found an i/o request */
-				consecutive_ios[0] = slot;
-
-				n_consecutive = 1;
-
 				biggest_age = age;
+				oldest_request = slot;
+				oldest_offset = slot->offset;
+			}
+
+			/* Look for an i/o request at the lowest offset in the
+			array (we ignore the high 32 bits of the offset) */
+			if (slot->offset < lowest_offset) {
+				/* Found an i/o request */
+				lowest_request = slot;
 				lowest_offset = slot->offset;
 			}
 		}
 	}

-	if (n_consecutive == 0) {
-		/* There were no old requests. Look for an i/o request at the
-		lowest offset in the array (we ignore the high 32 bits of the
-		offset in these heuristics) */
-
-		lowest_offset = ULINT_MAX;
-
-		for (i = 0; i < n; i++) {
-			slot = os_aio_array_get_nth_slot(array,
-							i + segment * n);
-
-			if (slot->reserved && slot->offset < lowest_offset) {
-
-				/* Found an i/o request */
-				consecutive_ios[0] = slot;
-
-				n_consecutive = 1;
-
-				lowest_offset = slot->offset;
-			}
-		}
-	}
-
-	if (n_consecutive == 0) {
+	if (!lowest_request && !oldest_request) {

 		/* No i/o requested at the moment */

 		goto wait_for_io;
 	}

-	slot = consecutive_ios[0];
+	if (oldest_request) {
+		slot = oldest_request;
+	} else {
+		slot = lowest_request;
+	}
+	consecutive_ios[0] = slot;
+	n_consecutive = 1;

 	/* Check if there are several consecutive blocks to read or write */

consecutive_loop:
 	for (i = 0; i < n; i++) {
-		slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
+		slot2 = os_aio_array_get_nth_slot(array, i);

 		if (slot2->reserved && slot2 != slot
 		    && slot2->offset == slot->offset + slot->len
 		    && slot->offset + slot->len > slot->offset /* check that
				sum does not wrap over */
 		    && slot2->offset_high == slot->offset_high
 		    && slot2->type == slot->type
-		    && slot2->file == slot->file) {
+		    && slot2->file == slot->file
+		    && slot2->status == OS_AIO_NOT_ISSUED) {

 			/* Found a consecutive i/o request */

 			consecutive_ios[n_consecutive] = slot2;
 			n_consecutive++;

 			slot = slot2;

-			if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
+			if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE
+			    && n_consecutive < innobase_max_merged_io) {

 				goto consecutive_loop;
 			} else {

 				break;
 			}
 		}
 	}
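The merge test in the loop above can be read as a single predicate; this distilled restatement is illustrative only (the helper name is hypothetical, not part of the patch):

/* Illustrative restatement of the merge condition used above: slot2 extends
slot when it starts exactly where slot ends, in the same file and the same
direction, the 32-bit offset sum did not wrap, and slot2 is still unissued. */
static ibool
os_aio_can_merge(const os_aio_slot_t* slot, const os_aio_slot_t* slot2)
{
	return(slot2->reserved && slot2 != slot
	       && slot2->offset == slot->offset + slot->len
	       && slot->offset + slot->len > slot->offset
	       && slot2->offset_high == slot->offset_high
	       && slot2->type == slot->type
	       && slot2->file == slot->file
	       && slot2->status == OS_AIO_NOT_ISSUED);
}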
@@ -3955,6 +4019,8 @@

 	for (i = 0; i < n_consecutive; i++) {
 		total_len += consecutive_ios[i]->len;
+		ut_a(consecutive_ios[i]->status == OS_AIO_NOT_ISSUED);
+		consecutive_ios[i]->status = OS_AIO_ISSUED;
 	}

 	if (n_consecutive == 1) {
@@ -3962,7 +4028,16 @@
 		combined_buf = slot->buf;
 		combined_buf2 = NULL;
 	} else {
-		combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE);
+		if ((total_len + UNIV_PAGE_SIZE)
+		    > os_aio_thread_buffer_size[global_segment]) {
+
+			if (os_aio_thread_buffer[global_segment])
+				ut_free(os_aio_thread_buffer[global_segment]);
+
+			os_aio_thread_buffer[global_segment]
+				= ut_malloc(total_len + UNIV_PAGE_SIZE);
+
+			os_aio_thread_buffer_size[global_segment]
+				= total_len + UNIV_PAGE_SIZE;
+		}
+		combined_buf2 = os_aio_thread_buffer[global_segment];

 		ut_a(combined_buf2);
@@ -3973,6 +4048,9 @@
 	this assumes that there is just one i/o-handler thread serving
 	a single segment of slots! */

+	ut_a(slot->reserved);
+	ut_a(slot->status == OS_AIO_ISSUED);
+
 	os_mutex_exit(array->mutex);

 	if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
@@ -3998,6 +4076,7 @@
 	/* Do the i/o with ordinary, synchronous i/o functions: */

 	if (slot->type == OS_FILE_WRITE) {
+		os_aio_thread_io_writes[global_segment] += n_consecutive;
 		if (array == os_aio_write_array) {
 			if ((total_len % UNIV_PAGE_SIZE != 0)
 			    || (slot->offset % UNIV_PAGE_SIZE != 0)) {
@@ -4012,16 +4091,34 @@
 			os_file_check_page_trailers(combined_buf, total_len);
 		}

+		start_usecs = time_usecs();
 		ret = os_file_write(slot->name, slot->file, combined_buf,
 				slot->offset, slot->offset_high, total_len);
-
+		stop_usecs = time_usecs();
+		elapsed_usecs = stop_usecs - start_usecs;
+		if (elapsed_usecs < 0) elapsed_usecs = 0;
+
 		if (array == os_aio_write_array) {
 			os_file_check_page_trailers(combined_buf, total_len);
 		}
-	} else {
+		os_aio_write_requests++;
+		os_aio_pages_written += n_consecutive;
+		os_aio_write_time += (ib_longlong)elapsed_usecs;
+	} else {
+		start_usecs = time_usecs();
+		os_aio_thread_io_reads[global_segment] += n_consecutive;
 		ret = os_file_read(slot->file, combined_buf,
 				slot->offset, slot->offset_high, total_len);
-	}
+		stop_usecs = time_usecs();
+		elapsed_usecs = stop_usecs - start_usecs;
+		if (elapsed_usecs < 0) elapsed_usecs = 0;
+		os_aio_read_requests++;
+		os_aio_pages_read += n_consecutive;
+		os_aio_read_time += (ib_longlong)elapsed_usecs;
+	}
+
+	if (elapsed_usecs > os_aio_thread_max_io_wait[global_segment])
+		os_aio_thread_max_io_wait[global_segment] = elapsed_usecs;
+	os_aio_thread_io_wait[global_segment] += elapsed_usecs;
+	os_aio_thread_io_requests[global_segment]++;

 	ut_a(ret);
 	srv_set_io_thread_op_info(global_segment, "file i/o done");
@@ -4042,16 +4139,13 @@
 		}
 	}

-	if (combined_buf2) {
-		ut_free(combined_buf2);
-	}
-
 	os_mutex_enter(array->mutex);

 	/* Mark the i/os done in slots */

 	for (i = 0; i < n_consecutive; i++) {
-		consecutive_ios[i]->io_already_done = TRUE;
+		ut_a(consecutive_ios[i]->status == OS_AIO_ISSUED);
+		consecutive_ios[i]->status = OS_AIO_DONE;
 	}

 	/* We return the messages for the first slot now, and if there were
@@ -4061,6 +4155,8 @@
slot_io_done:

 	ut_a(slot->reserved);
+	ut_a(slot->status == OS_AIO_DONE);
+	slot->status = OS_AIO_CLAIMED;

 	*message1 = slot->message1;
 	*message2 = slot->message2;
@@ -4070,7 +4166,8 @@
 	os_mutex_exit(array->mutex);

 	os_aio_array_free_slot(array, slot);
-
+	srv_set_io_thread_op_info(global_segment, "exited handler");
+
 	return(ret);

wait_for_io:
@@ -4115,7 +4212,6 @@
 	os_mutex_enter(array->mutex);

 	ut_a(array->n_slots > 0);
-	ut_a(array->n_segments > 0);

 	for (i = 0; i < array->n_slots; i++) {
 		slot = os_aio_array_get_nth_slot(array, i);
@@ -4165,11 +4261,20 @@
 	double		time_elapsed;
 	double		avg_bytes_read;
 	ulint		i;
-
-	for (i = 0; i < srv_n_file_io_threads; i++) {
-		fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
-			srv_io_thread_op_info[i],
-			srv_io_thread_function[i]);
+	ulint		num_issued, num_done, num_claimed;
+
+	if (file) {
+	for (i = 0; i < os_aio_n_segments; i++) {
+		fprintf(file,
+			"I/O thread %lu state: %s (%s) reads %lu writes %lu "
+			"requests %lu io secs %lf io msecs/request %lf max_io_wait %lf",
+			(ulong) i, srv_io_thread_op_info[i],
+			srv_io_thread_function[i],
+			os_aio_thread_io_reads[i], os_aio_thread_io_writes[i],
+			os_aio_thread_io_requests[i],
+			os_aio_thread_io_wait[i] / 1000000.0,
+			os_aio_thread_io_requests[i] ?
+			os_aio_thread_io_wait[i]
+				/ os_aio_thread_io_requests[i] / 1000.0 : 0.0,
+			os_aio_thread_max_io_wait[i] / 1000.0);

 #ifndef __WIN__
 		if (os_aio_segment_wait_events[i]->is_set) {
@@ -4181,6 +4286,7 @@
 	}

 	fputs("Pending normal aio reads:", file);
+	} /* if (file) */

 	array = os_aio_read_array;
loop:
@@ -4189,14 +4295,23 @@
 	os_mutex_enter(array->mutex);

 	ut_a(array->n_slots > 0);
-	ut_a(array->n_segments > 0);

 	n_reserved = 0;
+	num_done = num_issued = num_claimed = 0;

 	for (i = 0; i < array->n_slots; i++) {
 		slot = os_aio_array_get_nth_slot(array, i);

 		if (slot->reserved) {
+			if (slot->status == OS_AIO_ISSUED)
+				num_issued++;
+			else if (slot->status == OS_AIO_DONE)
+				num_done++;
+			else {
+				ut_ad(slot->status == OS_AIO_CLAIMED);
+				num_claimed++;
+			}
+
 			n_reserved++;
/*			fprintf(stderr, "Reserved slot, messages %p %p\n",
				slot->message1, slot->message2); */
@@ -4206,42 +4321,56 @@

 	ut_a(array->n_reserved == n_reserved);

-	fprintf(file, " %lu", (ulong) n_reserved);
-
+	if (file) fprintf(file, " %lu", (ulong) n_reserved);
+
 	os_mutex_exit(array->mutex);

 	if (array == os_aio_read_array) {
-		fputs(", aio writes:", file);
-
+		inno_pending_normal_aio_reads = (ulong) n_reserved;
+		if (file) fputs(", aio writes:", file);
 		array = os_aio_write_array;

 		goto loop;
 	}

 	if (array == os_aio_write_array) {
-		fputs(",\n ibuf aio reads:", file);
+		inno_pending_normal_aio_writes = (ulong) n_reserved;
+		if (file) fputs(",\n ibuf aio reads:", file);
 		array = os_aio_ibuf_array;

 		goto loop;
 	}

 	if (array == os_aio_ibuf_array) {
-		fputs(", log i/o's:", file);
+		inno_pending_ibuf_aio_reads = (ulong) n_reserved;
+		if (file) fputs(", log i/o's:", file);
 		array = os_aio_log_array;

 		goto loop;
 	}

 	if (array == os_aio_log_array) {
-		fputs(", sync i/o's:", file);
+		inno_pending_log_ios = (ulong) n_reserved;
+		if (file) fputs(", sync i/o's:", file);
 		array = os_aio_sync_array;

 		goto loop;
 	}

-	putc('\n', file);
+	if (array == os_aio_sync_array) {
+		inno_pending_sync_ios = (ulong) n_reserved;
+	}
+
 	current_time = time(NULL);
 	time_elapsed = 0.001 + difftime(current_time, os_last_printout);
+
+	if (file) {
+	putc('\n', file);
+	fprintf(file,
+		"Summary of background IO slot status: %lu issued, "
+		"%lu done, %lu claimed, sleep set %d\n",
+		(ulong) num_issued, (ulong) num_done, (ulong) num_claimed,
+		(int) os_aio_recommend_sleep_for_read_threads);

 	fprintf(file,
 		"Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
@@ -4274,6 +4403,7 @@
			/ time_elapsed,
		(os_n_fsyncs - os_n_fsyncs_old) / time_elapsed);
+	} /* if (file) */

 	os_n_file_reads_old = os_n_file_reads;
 	os_n_file_writes_old = os_n_file_writes;
diff -r 322370200e6a innobase/srv/srv0srv.c
--- a/innobase/srv/srv0srv.c	Mon Nov 03 05:07:57 2008 -0800
+++ b/innobase/srv/srv0srv.c	Mon Nov 03 05:08:52 2008 -0800
@@ -164,7 +164,17 @@
 ulint	srv_mem_pool_size	= ULINT_MAX;	/* size in bytes */
 ulint	srv_lock_table_size	= ULINT_MAX;

+ulint	srv_io_capacity		= ULINT_MAX;	/* Number of IO operations per
						second the server can do */
+
+ibool	srv_extra_dirty_writes	= TRUE;	/* Write dirty pages to disk when pct
					dirty < max dirty pct */
+
+/* Deprecated by srv_n_{read,write}_io_threads */
 ulint	srv_n_file_io_threads	= ULINT_MAX;
+/* Number of background IO threads for read and write requests */
+ulint	srv_n_read_io_threads	= ULINT_MAX;
+ulint	srv_n_write_io_threads	= ULINT_MAX;

 #ifdef UNIV_LOG_ARCHIVE
 ibool	srv_log_archive_on	= FALSE;
@@ -238,6 +248,24 @@
 /* variable to count the number of random read-aheads */
 ulint	srv_read_ahead_rnd	= 0;

+/* Number of IO operations read/write done for all threads */
+ulint	os_aio_read_requests	= 0;
+ulint	os_aio_write_requests	= 0;
+
+/* Number of pages read/written done for all threads */
+ulint	os_aio_pages_read	= 0;
+ulint	os_aio_pages_written	= 0;
+
+/* time usec used to perform read/write for all threads */
+ib_longlong	os_aio_read_time	= 0;
+ib_longlong	os_aio_write_time	= 0;
+
+ulint	inno_pending_normal_aio_reads	= 0;
+ulint	inno_pending_normal_aio_writes	= 0;
+ulint	inno_pending_ibuf_aio_reads	= 0;
+ulint	inno_pending_log_ios		= 0;
+ulint	inno_pending_sync_ios		= 0;

 /* structure to pass status variables to MySQL */
 export_struc export_vars;
@@ -413,6 +441,23 @@

 ulint	srv_main_thread_process_no	= 0;
 ulint	srv_main_thread_id		= 0;
+
+/* The following count work done by srv_master_thread. */
+
+/* Iterations by the 'once per second' loop. */
+ulint	srv_main_1_second_loops		= 0;
+/* Calls to sleep by the 'once per second' loop. */
+ulint	srv_main_sleeps			= 0;
+/* Iterations by the 'once per 10 seconds' loop. */
+ulint	srv_main_10_second_loops	= 0;
+/* Iterations of the loop bounded by the 'background_loop' label. */
+ulint	srv_main_background_loops	= 0;
+/* Iterations of the loop bounded by the 'flush_loop' label. */
+ulint	srv_main_flush_loops		= 0;
+/* Calls to log_buffer_flush_to_disk. */
+ulint	srv_sync_flush			= 0;
+/* Calls to log_buffer_flush_maybe_sync. */
+ulint	srv_async_flush			= 0;

 /*
	IMPLEMENTATION OF THE SERVER MAIN PROGRAM
@@ -2170,7 +2215,12 @@
 }

 /*************************************************************************
-The master thread controlling the server. */
+Returns the number of IO operations that is X percent of the capacity.
+
+PCT_IO(5) -> returns the number of IO operations that is 5% of the max
+where max is srv_io_capacity.
+*/
+#define PCT_IO(pct) ((ulint) (srv_io_capacity * ((double) pct / 100.0)))

 #ifndef __WIN__
 void*
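A worked expansion of the macro, with illustrative numbers only: with innodb_io_capacity set to 1000 the batch sizes used below scale up, while the default capacity of 100 reproduces the old hard-coded constants.

	/* Illustrative expansion of PCT_IO(), assuming srv_io_capacity == 1000: */
	ut_a(PCT_IO(5) == 50);		/* insert buffer merge batch (was a fixed 5) */
	ut_a(PCT_IO(100) == 1000);	/* buffer pool flush batch (was a fixed 100) */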
@@ -2199,11 +2249,15 @@
 	ulint		n_pend_ios;
 	ibool		skip_sleep	= FALSE;
 	ulint		i;
+
 #ifdef UNIV_DEBUG_THREAD_CREATION
 	fprintf(stderr, "Master thread starts, id %lu\n",
		os_thread_pf(os_thread_get_curr_id()));
 #endif
+	fprintf(stderr, "InnoDB: master thread running with io_capacity %lu\n",
+		srv_io_capacity);
+
 	srv_main_thread_process_no = os_proc_get_number();
 	srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());

@@ -2275,26 +2329,28 @@
 		srv_main_thread_op_info = "flushing log";

 		log_buffer_flush_to_disk();
+		srv_sync_flush++;

 		srv_main_thread_op_info = "making checkpoint";

 		log_free_check();

-		/* If there were less than 5 i/os during the
-		one second sleep, we assume that there is free
-		disk i/o capacity available, and it makes sense to
-		do an insert buffer merge. */
+		/* If i/os during the one second sleep were less than 5% of
+		capacity, we assume that there is free disk i/o capacity
+		available, and it makes sense to do an insert buffer merge. */

 		n_pend_ios = buf_get_n_pending_ios()
			+ log_sys->n_pending_writes;
 		n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
			+ buf_pool->n_pages_written;
-		if (n_pend_ios < 3 && (n_ios - n_ios_old < 5)) {
+		if (n_pend_ios < PCT_IO(3) && (n_ios - n_ios_old < PCT_IO(5))) {
 			srv_main_thread_op_info = "doing insert buffer merge";
-			ibuf_contract_for_n_pages(TRUE, 5);
+			ibuf_contract_for_n_pages(TRUE, PCT_IO(5));

 			srv_main_thread_op_info = "flushing log";

-			log_buffer_flush_to_disk();
+			/* No fsync when srv_flush_log_at_trx_commit != 1 */
+			log_buffer_flush_maybe_sync();
+			srv_async_flush++;
 		}

 		if (buf_get_modified_ratio_pct() >
@@ -2303,7 +2359,8 @@
 			/* Try to keep the number of modified pages in the
			buffer pool under the limit wished by the user */

-			n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
-							  ut_dulint_max);
+			n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
+							  PCT_IO(100),
+							  ut_dulint_max);

			/* If we had to do the flush, it may have taken
@@ -2325,36 +2382,47 @@
		/* ---- We perform the following code approximately once per
		10 seconds when there is database activity */

+		srv_main_10_second_loops++;
 #ifdef MEM_PERIODIC_CHECK
		/* Check magic numbers of every allocated mem block once in 10
		seconds */
		mem_validate_all_blocks();
 #endif
-		/* If there were less than 200 i/os during the 10 second period,
-		we assume that there is free disk i/o capacity available, and it
-		makes sense to flush 100 pages. */
+		/* If i/os during the 10 second period were less than 200% of
+		capacity, we assume that there is free disk i/o capacity
+		available, and it makes sense to flush srv_io_capacity pages.
+
+		Note that this is done regardless of the fraction of dirty
+		pages relative to the max requested by the user. The one second
+		loop above requests writes for that case. The writes done here
+		are not required, and may be disabled. */

		n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes;
		n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
			+ buf_pool->n_pages_written;
-		if (n_pend_ios < 3 && (n_ios - n_ios_very_old < 200)) {
+		if (srv_extra_dirty_writes
+		    && n_pend_ios < PCT_IO(3)
+		    && (n_ios - n_ios_very_old < PCT_IO(200))) {

			srv_main_thread_op_info = "flushing buffer pool pages";
-			buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
+			buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+					ut_dulint_max);

			srv_main_thread_op_info = "flushing log";
-			log_buffer_flush_to_disk();
+			/* No fsync when srv_flush_log_at_trx_commit != 1 */
+			log_buffer_flush_maybe_sync();
+			srv_async_flush++;
		}

		/* We run a batch of insert buffer merge every 10 seconds,
		even if the server were active */

		srv_main_thread_op_info = "doing insert buffer merge";
-		ibuf_contract_for_n_pages(TRUE, 5);
+		ibuf_contract_for_n_pages(TRUE, PCT_IO(5));

		srv_main_thread_op_info = "flushing log";

-		log_buffer_flush_to_disk();
+		/* No fsync when srv_flush_log_at_trx_commit != 1 */
+		log_buffer_flush_maybe_sync();
+		srv_async_flush++;

		/* We run a full purge every 10 seconds, even if the server
		were active */
@@ -2378,8 +2446,9 @@
		if (difftime(current_time, last_flush_time) > 1) {
			srv_main_thread_op_info = "flushing log";

			log_buffer_flush_to_disk();
			last_flush_time = current_time;
+			srv_sync_flush++;
		}
	}
@@ -2393,14 +2462,14 @@
		(> 70 %), we assume we can afford reserving the disk(s) for
		the time it requires to flush 100 pages */

-		n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
-						  ut_dulint_max);
+		n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+						  ut_dulint_max);
	} else {
		/* Otherwise, we only flush a small number of pages so that
		we do not unnecessarily use much disk i/o capacity from
		other work */

-		n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 10,
-						  ut_dulint_max);
+		n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10),
+						  ut_dulint_max);
	}
@@ -2434,7 +2503,7 @@
	/* The server has been quiet for a while: start running background
	operations */
-
+	srv_main_background_loops++;
	srv_main_thread_op_info = "doing background drop tables";

	n_tables_to_drop = row_drop_tables_for_mysql_in_background();
@@ -2472,6 +2541,7 @@
			log_buffer_flush_to_disk();
			last_flush_time = current_time;
+			srv_sync_flush++;
		}
	}
@@ -2487,9 +2557,13 @@
	srv_main_thread_op_info = "doing insert buffer merge";

	if (srv_fast_shutdown && srv_shutdown_state > 0) {
		n_bytes_merged = 0;
	} else {
-		n_bytes_merged = ibuf_contract_for_n_pages(TRUE, 20);
+		/* This should do an amount of IO similar to the number of
+		dirty pages that will be flushed in the call to
+		buf_flush_batch below. Otherwise, the system favors
+		clean pages over cleanup throughput. */
+		n_bytes_merged = ibuf_contract_for_n_pages(TRUE, PCT_IO(100));
	}

	srv_main_thread_op_info = "reserving kernel mutex";
@@ -2503,10 +2577,11 @@

flush_loop:
	srv_main_thread_op_info = "flushing buffer pool pages";
+	srv_main_flush_loops++;

	if (srv_fast_shutdown < 2) {
		n_pages_flushed =
-			buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
+			buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+					ut_dulint_max);
	} else {
		/* In the fastest shutdown we do not flush the buffer pool
		to data files: we set n_pages_flushed to 0 artificially. */
@@ -2528,7 +2603,17 @@

	srv_main_thread_op_info = "flushing log";

-	log_buffer_flush_to_disk();
+	current_time = time(NULL);
+	if (difftime(current_time, last_flush_time) > 1) {
+		srv_main_thread_op_info = (char*) "flushing log";
+		log_buffer_flush_to_disk();
+		last_flush_time = current_time;
+		srv_sync_flush++;
+	} else {
+		/* No fsync when srv_flush_log_at_trx_commit != 1 */
+		log_buffer_flush_maybe_sync();
+		srv_async_flush++;
+	}

	srv_main_thread_op_info = "making checkpoint";
diff -r 322370200e6a innobase/srv/srv0start.c
--- a/innobase/srv/srv0start.c	Mon Nov 03 05:07:57 2008 -0800
+++ b/innobase/srv/srv0start.c	Mon Nov 03 05:08:52 2008 -0800
@@ -973,6 +973,7 @@
	ulint	i;
	ibool	srv_file_per_table_original_value  = srv_file_per_table;
	mtr_t	mtr;
+	ulint	n_threads;
 #ifdef HAVE_DARWIN_THREADS
 # ifdef F_FULLFSYNC
	/* This executable has been compiled on Mac OS X 10.3 or later.
@@ -1206,24 +1207,32 @@
	}

	/* Restrict the maximum number of file i/o threads */
-	if (srv_n_file_io_threads > SRV_MAX_N_IO_THREADS) {
-
-		srv_n_file_io_threads = SRV_MAX_N_IO_THREADS;
+	if ((srv_n_read_io_threads + srv_n_write_io_threads)
+	    > SRV_MAX_N_IO_THREADS) {
+		fprintf(stderr,
+			"InnoDB: requested too many read(%lu) or write(%lu)"
+			" IO threads, max is %d\n",
+			(ulong) srv_n_read_io_threads,
+			(ulong) srv_n_write_io_threads,
+			SRV_MAX_N_IO_THREADS);
+		return(DB_ERROR);
	}

	if (!os_aio_use_native_aio) {
-		/* In simulated aio we currently have use only for 4 threads */
-		srv_n_file_io_threads = 4;
-
-		os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
-			    * srv_n_file_io_threads,
-			    srv_n_file_io_threads,
-			    SRV_MAX_N_PENDING_SYNC_IOS);
-	} else {
-		os_aio_init(SRV_N_PENDING_IOS_PER_THREAD
-			    * srv_n_file_io_threads,
-			    srv_n_file_io_threads,
-			    SRV_MAX_N_PENDING_SYNC_IOS);
+		/* More than 4 threads are now supported. */
+		n_threads = os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD,
+					srv_n_read_io_threads,
+					srv_n_write_io_threads,
+					SRV_MAX_N_PENDING_SYNC_IOS);
+	} else {
+		/* Might need more slots here. Alas, I don't do windows. */
+		n_threads = os_aio_init(SRV_N_PENDING_IOS_PER_THREAD,
+					srv_n_read_io_threads,
+					srv_n_write_io_threads,
+					SRV_MAX_N_PENDING_SYNC_IOS);
+	}
+
+	if (n_threads > SRV_MAX_N_IO_THREADS) {
+		fprintf(stderr,
+			"InnoDB: requested too many IO threads(%lu),"
+			" max is %d\n",
+			(ulong) n_threads, SRV_MAX_N_IO_THREADS);
+		return(DB_ERROR);
	}

	fil_init(srv_max_n_open_files);
@@ -1259,11 +1268,11 @@

	/* Create i/o-handler threads: */

-	for (i = 0; i < srv_n_file_io_threads; i++) {
+	for (i = 0; i < n_threads; i++) {
		n[i] = i;

		os_thread_create(io_handler_thread, n + i, thread_ids + i);
	}

 #ifdef UNIV_LOG_ARCHIVE
	if (0 != ut_strcmp(srv_log_group_home_dirs[0], srv_arch_dir)) {
diff -r 322370200e6a patch_info/innodb_io_tune.info
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/patch_info/innodb_io_tune.info	Mon Nov 03 05:08:52 2008 -0800
@@ -0,0 +1,9 @@
+File=innodb_io_tune.patch
+Name=Tune InnoDB IO settings
+Version=1.0
+Author=Google
+License=GPL
+Comment=
+ChangeLog=
+2008-11-01
+VT: Initial porting
diff -r 322370200e6a sql/ha_innodb.cc
--- a/sql/ha_innodb.cc	Mon Nov 03 05:07:57 2008 -0800
+++ b/sql/ha_innodb.cc	Mon Nov 03 05:08:52 2008 -0800
@@ -147,7 +147,7 @@
	innobase_additional_mem_pool_size, innobase_file_io_threads,
	innobase_lock_wait_timeout, innobase_force_recovery,
	innobase_open_files;
-
+long innobase_read_io_threads, innobase_write_io_threads;
 longlong innobase_buffer_pool_size, innobase_log_file_size;

 /* The default values for the following char* start-up parameters
@@ -175,6 +175,23 @@
 my_bool innobase_rollback_on_timeout		= FALSE;
 my_bool innobase_create_status_file		= FALSE;
 my_bool innobase_adaptive_hash_index		= TRUE;
+
+/* Max number of IO requests merged to perform large IO in background
+IO threads. */
+long innobase_max_merged_io = 64;
+
+/* Minimum time interval in seconds between calls to the innodb_show_status
+functions. */
+long innobase_min_status_update_time_interval = 30;
+
+/* Default number of IO operations per second the server can do. Tunes the
+background IO rate. */
+long innobase_io_capacity = 100;
+
+/* Write dirty pages when pct dirty is less than max pct dirty */
+my_bool innobase_extra_dirty_writes = TRUE;

 static char *internal_innobase_data_file_path	= NULL;
@@ -1372,7 +1389,11 @@

	srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;

+	srv_io_capacity = (ulint) innobase_io_capacity;
+	srv_extra_dirty_writes = (ibool) innobase_extra_dirty_writes;
	srv_n_file_io_threads = (ulint) innobase_file_io_threads;
+	srv_n_read_io_threads = (ulint) innobase_read_io_threads;
+	srv_n_write_io_threads = (ulint) innobase_write_io_threads;

	srv_lock_wait_timeout = (ulint) innobase_lock_wait_timeout;
	srv_force_recovery = (ulint) innobase_force_recovery;
diff -r 322370200e6a sql/ha_innodb.h
--- a/sql/ha_innodb.h	Mon Nov 03 05:07:57 2008 -0800
+++ b/sql/ha_innodb.h	Mon Nov 03 05:08:52 2008 -0800
@@ -197,6 +197,7 @@
 extern struct show_var_st innodb_status_variables[];
 extern ulong innobase_fast_shutdown;
+extern long innobase_max_merged_io;
 extern ulong innobase_large_page_size;
 extern long innobase_mirrored_log_groups, innobase_log_files_in_group;
 extern longlong innobase_buffer_pool_size, innobase_log_file_size;
@@ -205,10 +206,14 @@
 extern long innobase_buffer_pool_awe_mem_mb;
 extern long innobase_file_io_threads, innobase_lock_wait_timeout;
 extern long innobase_force_recovery;
+extern long innobase_read_io_threads, innobase_write_io_threads;
 extern long innobase_open_files;
 extern char *innobase_data_home_dir, *innobase_data_file_path;
 extern char *innobase_log_group_home_dir, *innobase_log_arch_dir;
 extern char *innobase_unix_file_flush_method;
+extern long innobase_io_capacity;
+extern my_bool innobase_extra_dirty_writes;
+
 /* The following variables have to be my_bool for SHOW VARIABLES to work */
 extern my_bool innobase_log_archive,
	innobase_use_doublewrite,
diff -r 322370200e6a sql/mysqld.cc
--- a/sql/mysqld.cc	Mon Nov 03 05:07:57 2008 -0800
+++ b/sql/mysqld.cc	Mon Nov 03 05:08:52 2008 -0800
@@ -4932,6 +4932,11 @@
  OPT_INNODB_ADDITIONAL_MEM_POOL_SIZE,
  OPT_INNODB_MAX_PURGE_LAG,
  OPT_INNODB_FILE_IO_THREADS,
+  OPT_INNODB_READ_IO_THREADS,
+  OPT_INNODB_WRITE_IO_THREADS,
+  OPT_INNODB_MAX_MERGED_IO,
+  OPT_INNODB_IO_CAPACITY,
+  OPT_INNODB_EXTRA_DIRTY_WRITES,
  OPT_INNODB_LOCK_WAIT_TIMEOUT,
  OPT_INNODB_THREAD_CONCURRENCY,
  OPT_INNODB_COMMIT_CONCURRENCY,
@@ -5302,6 +5307,25 @@
   (gptr*) &global_system_variables.innodb_table_locks,
   (gptr*) &global_system_variables.innodb_table_locks,
   0, GET_BOOL, OPT_ARG, 1, 0, 0, 0, 0, 0},
+  {"innodb_max_merged_io", OPT_INNODB_MAX_MERGED_IO,
+   "Max number of IO requests merged to issue large IO from background IO threads.",
+   (gptr*) &innobase_max_merged_io, (gptr*) &innobase_max_merged_io,
+   0, GET_LONG, REQUIRED_ARG, 64, 1, 64, 0, 0, 0},
+  {"innodb_read_io_threads", OPT_INNODB_READ_IO_THREADS,
+   "Number of background read I/O threads in InnoDB.",
+   (gptr*) &innobase_read_io_threads, (gptr*) &innobase_read_io_threads,
+   0, GET_LONG, REQUIRED_ARG, 1, 1, 64, 0, 1, 0},
+  {"innodb_write_io_threads", OPT_INNODB_WRITE_IO_THREADS,
+   "Number of background write I/O threads in InnoDB.",
+   (gptr*) &innobase_write_io_threads, (gptr*) &innobase_write_io_threads,
+   0, GET_LONG, REQUIRED_ARG, 1, 1, 64, 0, 1, 0},
+  {"innodb_io_capacity", OPT_INNODB_IO_CAPACITY,
+   "Number of IO operations per second the server can do. Tunes background IO rate.",
+   (gptr*) &innobase_io_capacity, (gptr*) &innobase_io_capacity,
+   0, GET_LONG, REQUIRED_ARG, 100, 100, 999999999, 0, 1, 0},
+  {"innodb_extra_dirty_writes", OPT_INNODB_EXTRA_DIRTY_WRITES,
+   "When set, flush dirty buffer pages when dirty pct is less than max dirty pct.",
+   (gptr*) &innobase_extra_dirty_writes, (gptr*) &innobase_extra_dirty_writes,
+   0, GET_BOOL, NO_ARG, 1, 0, 1, 0, 1, 0},
 #endif /* End HAVE_INNOBASE_DB */
  {"isam", OPT_ISAM, "Obsolete. ISAM storage engine is no longer supported.",
  (gptr*) &opt_isam, (gptr*) &opt_isam, 0, GET_BOOL, NO_ARG, 0, 0, 0,
diff -r 322370200e6a sql/set_var.cc
--- a/sql/set_var.cc	Mon Nov 03 05:07:57 2008 -0800
+++ b/sql/set_var.cc	Mon Nov 03 05:08:52 2008 -0800
@@ -919,12 +919,14 @@
  {"innodb_data_home_dir", (char*) &innobase_data_home_dir, SHOW_CHAR_PTR},
  {"innodb_adaptive_hash_index", (char*) &innobase_adaptive_hash_index, SHOW_MY_BOOL},
  {"innodb_doublewrite", (char*) &innobase_use_doublewrite, SHOW_MY_BOOL},
+  {"innodb_extra_dirty_writes", (char*) &innobase_extra_dirty_writes, SHOW_MY_BOOL},
  {sys_innodb_fast_shutdown.name,(char*) &sys_innodb_fast_shutdown, SHOW_SYS},
  {"innodb_file_io_threads", (char*) &innobase_file_io_threads, SHOW_LONG },
  {"innodb_file_per_table", (char*) &innobase_file_per_table, SHOW_MY_BOOL},
  {sys_innodb_flush_log_at_trx_commit.name, (char*) &sys_innodb_flush_log_at_trx_commit, SHOW_SYS},
  {"innodb_flush_method", (char*) &innobase_unix_file_flush_method, SHOW_CHAR_PTR},
  {"innodb_force_recovery", (char*) &innobase_force_recovery, SHOW_LONG },
+  {"innodb_io_capacity", (char*) &innobase_io_capacity, SHOW_LONG },
  {"innodb_lock_wait_timeout", (char*) &innobase_lock_wait_timeout, SHOW_LONG },
  {"innodb_locks_unsafe_for_binlog", (char*) &innobase_locks_unsafe_for_binlog, SHOW_MY_BOOL},
  {"innodb_log_arch_dir", (char*) &innobase_log_arch_dir, SHOW_CHAR_PTR},
@@ -943,6 +945,9 @@
  {sys_innodb_table_locks.name, (char*) &sys_innodb_table_locks, SHOW_SYS},
  {sys_innodb_thread_concurrency.name, (char*) &sys_innodb_thread_concurrency, SHOW_SYS},
  {sys_innodb_thread_sleep_delay.name, (char*) &sys_innodb_thread_sleep_delay, SHOW_SYS},
+  {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG },
+  {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG },
+  {"innodb_max_merged_io", (char*) &innobase_max_merged_io, SHOW_LONG},
 #endif
  {sys_interactive_timeout.name,(char*) &sys_interactive_timeout, SHOW_SYS},
  {sys_join_buffer_size.name, (char*) &sys_join_buffer_size, SHOW_SYS},
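Taken together, a server on a RAID array capable of roughly 1000 IOPS might be configured as follows. The values are illustrative assumptions only; the option names are the ones added by this patch:

[mysqld]
innodb_read_io_threads = 4
innodb_write_io_threads = 4
innodb_io_capacity = 1000
innodb_max_merged_io = 32
innodb_extra_dirty_writes = 1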