From c62a2810e54ed4ac7b98c75896b614d3ff3eb619 Mon Sep 17 00:00:00 2001
From: Ramon Fernandez <ramon@mongodb.com>
Date: Mon, 29 Feb 2016 14:17:00 -0500
Subject: [PATCH] Import wiredtiger-wiredtiger-mongodb-3.0.9-3-g3dbc6c6.tar.gz
 from wiredtiger branch mongodb-3.0

ref: 62b3ca8..3dbc6c6

WT-2130       Improve on-disk page utilization with random workloads
SERVER-22898  High fragmentation on WiredTiger databases under write workloads
---
 src/third_party/wiredtiger/src/include/misc.h |  3 +++
 .../wiredtiger/src/reconcile/rec_write.c      | 26 ++++++++++++++-----
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h
index e2b46d0dbdc..a099213e004 100644
--- a/src/third_party/wiredtiger/src/include/misc.h
+++ b/src/third_party/wiredtiger/src/include/misc.h
@@ -47,6 +47,9 @@
 #define	WT_ALIGN(n, v)							\
 	((((uintmax_t)(n)) + ((v) - 1)) & ~(((uintmax_t)(v)) - 1))
 
+#define	WT_ALIGN_NEAREST(n, v)						\
+	((((uintmax_t)(n)) + ((v) / 2)) & ~(((uintmax_t)(v)) - 1))
+
 /* Min, max. */
 #define	WT_MIN(a, b)	((a) < (b) ? (a) : (b))
 #define	WT_MAX(a, b)	((a) < (b) ? (b) : (a))
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 67b43057c8a..a2a8a330c1d 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -1628,15 +1628,18 @@ __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
 	 * we don't waste space when we write).
 	 */
 	a = maxpagesize;			/* Don't overflow. */
-	split_size = (uint32_t)
-	    WT_ALIGN((a * (u_int)btree->split_pct) / 100, btree->allocsize);
+	split_size = (uint32_t)WT_ALIGN_NEAREST(
+	    (a * (u_int)btree->split_pct) / 100, btree->allocsize);
 
 	/*
-	 * If the result of that calculation is the same as the allocation unit
-	 * (that happens if the maximum size is the same size as an allocation
-	 * unit, use a percentage of the maximum page size).
+	 * Respect the configured split percentage if the calculated split
+	 * size is either zero or a full page. The user has either configured
+	 * an allocation size that matches the page size, or a split
+	 * percentage that is close to zero or one hundred. Rounding is going
+	 * to provide a worse outcome than having a split point that doesn't
+	 * fall on an allocation size boundary in those cases.
 	 */
-	if (split_size == btree->allocsize)
+	if (split_size == 0 || split_size == maxpagesize)
 		split_size = (uint32_t)((a * (u_int)btree->split_pct) / 100);
 
 	return (split_size);
@@ -2957,6 +2960,17 @@ skip_check_complete:
 		}
 	}
 
+	bnd->entries = r->entries;
+	/* Output a verbose message if we create a page without many entries */
+	if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) && r->entries < 6)
+		WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+		    "Reconciliation creating a page with %" PRIu32
+		    " entries, memory footprint %" PRIu64
+		    ", page count %" PRIu32 ", %s, split state: %d\n",
+		    r->entries, r->page->memory_footprint, r->bnd_next,
+		    F_ISSET(r, WT_EVICTING) ? "evict" : "checkpoint",
+		    r->bnd_state));
+
 	WT_ERR(__wt_bt_write(session,
 	    buf, addr, &addr_size, false, bnd->already_compressed));
 	WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr));
-- 
GitLab