## Thursday, April 7, 2011

### Optimizing Merge Sort

A number of years ago I wrote about the Merge Sort algorithm. One of the advantages of Merge Sort is that it is a stable sort, meaning that elements that compare as being equal remain in their original order after being sorted.

Well, today I had need of employing a stable sorting routine for sorting elements by a ZIndex in Moonlight. Up until today, we had been using qsort() which, while not guaranteed to be a stable sort on any platform, happens to be implemented in glibc as a stable sort except in out-of-memory conditions. Since we'd like Moonlight to work on platforms other than Linux+glibc (such as Mac OS or BSD), it has become important enough to implement properly.

To start, I dusted off my generic MergeSort() implementation from years ago when I was writing articles about various sorting algorithms. This is what I had to start with:

```#define MID(lo, hi) (lo + ((hi - lo) >> 1))

static void
msort (void *array, void *buf, size_t low, size_t high, size_t size,
int (* compare) (const void *, const void *))
{
register char *lo, *hi, *b;
char *al, *am, *ah;
size_t mid;

mid = MID (low, high);

if (mid + 1 < high)
msort (array, buf, mid + 1, high, size, compare);

if (mid > low)
msort (array, buf, low, mid, size, compare);

ah = ((char *) array) + ((high + 1) * size);
am = ((char *) array) + ((mid + 1) * size);
al = ((char *) array) + (low * size);

b = (char *) buf;
lo = al;
hi = am;

while (lo < am && hi < ah) {
if (compare (lo, hi) <= 0) {
memcpy (b, lo, size);
lo += size;
} else {
memcpy (b, hi, size);
hi += size;
}

b += size;
}

if (lo < am)
memcpy (b, lo, am - lo);
else if (hi < ah)
memcpy (b, hi, (ah + size) - hi);

memcpy (al, buf, ah - al);
}

int
MergeSort (void *base, size_t nmemb, size_t size,
int (* compare) (const void *, const void *))
{
void *tmp;

if (nmemb < 2)
return 0;

if (!(tmp = malloc (nmemb * size))) {
errno = ENOMEM;
return -1;
}

msort (base, tmp, 0, nmemb - 1, size, compare);

free (tmp);

return 0;
}
```

Since performance is very important, I clocked this implementation against qsort() and got the following results on my Intel Core2 Quad Q6600 2.4 GHz machine using arrays of 10 million ints:

• Randomized input: 14.13s vs qsort()'s 6.77s
• Sorted input: 4.41s vs qsort()'s 1.54s
• Reversed input: 4.26s vs qsort()'s 1.90s

Clearly the above MergeSort() implementation did not fare well against glibc's qsort() on my system, so it was time to look at what I could do to improve the performance.

The most obvious optimization I could see was to try and batch my memcpy() calls. In other words, instead of calling memcpy() to copy each and every element into our temporary buffer, it'd be more efficient to copy blocks of elements at a time:

```static void
msort (void *array, void *buf, size_t low, size_t high, size_t size,
int (* compare) (const void *, const void *))
{
char *al, *am, *ah, *ls, *hs, *lo, *hi, *b;
size_t mid;

mid = MID (low, high);

if (mid + 1 < high)
msort (array, buf, mid + 1, high, size, compare);

if (mid > low)
msort (array, buf, low, mid, size, compare);

ah = ((char *) array) + ((high + 1) * size);
am = ((char *) array) + ((mid + 1) * size);
al = ((char *) array) + (low * size);

b = (char *) buf;
lo = al;
hi = am;

do {
ls = lo;
hs = hi;

if (lo > al || hi > am) {
/* our last loop already compared lo & hi and found lo <= hi */
lo += size;
}

while (lo < am && compare (lo, hi) <= 0)
lo += size;

if (lo > ls) {
memcpy (b, ls, lo - ls);
b += (lo - ls);
}

if (lo < am) {
/* our last compare tells us hi < lo */
hi += size;

while (hi < ah && compare (hi, lo) < 0)
hi += size;

memcpy (b, hs, hi - hs);
b += (hi - hs);
}
} while (lo < am && hi < ah);

if (lo < am)
memcpy (b, lo, am - lo);
else if (hi < ah)
memcpy (b, hi, ah - hi);

memcpy (al, buf, ah - al);
}
```

The results were promising. For the exact same inputs (including the exact same random array), we now get:

• Randomized input: 10.45s
• Sorted input: 2.08s
• Reversed input: 2.03s

The only other way that we can reduce the number of memcpy() calls we make is to avoid copying leading and trailing elements into our temporary buffer if it's not necessary to merge them. Here's the solution I came up with:

```static void
msort (void *array, void *buf, size_t low, size_t high, size_t size,
int (* compare) (const void *, const void *))
{
char *a1, *al, *am, *ah, *ls, *hs, *lo, *hi, *b;
size_t copied = 0;
size_t mid;

mid = MID (low, high);

if (mid + 1 < high)
msort (array, buf, mid + 1, high, size, compare);

if (mid > low)
msort (array, buf, low, mid, size, compare);

ah = ((char *) array) + ((high + 1) * size);
am = ((char *) array) + ((mid + 1) * size);
a1 = al = ((char *) array) + (low * size);

b = (char *) buf;
lo = al;
hi = am;

do {
ls = lo;
hs = hi;

if (lo > al || hi > am) {
/* our last loop already compared lo & hi and found lo <= hi */
lo += size;
}

while (lo < am && compare (lo, hi) <= 0)
lo += size;

if (lo < am) {
if (copied == 0) {
/* avoid copying the leading items */
a1 = lo;
ls = lo;
}

/* our last compare tells us hi < lo */
hi += size;

while (hi < ah && compare (hi, lo) < 0)
hi += size;

if (lo > ls) {
memcpy (b, ls, lo - ls);
copied += (lo - ls);
b += (lo - ls);
}

memcpy (b, hs, hi - hs);
copied += (hi - hs);
b += (hi - hs);
} else if (copied) {
memcpy (b, ls, lo - ls);
copied += (lo - ls);
b += (lo - ls);

/* copy everything we needed to re-order back into array */
memcpy (a1, buf, copied);
return;
} else {
/* everything already in order */
return;
}
} while (hi < ah);

if (lo < am) {
memcpy (b, lo, am - lo);
copied += (am - lo);
}

memcpy (a1, buf, copied);
}
```

Once again, reducing the amount of copying paid off:

• Randomized input: 9.80s
• Sorted input: 0.95s
• Reversed input: 2.05s

Update 2011-05-18: One final optimization that can be tried is pre-calculating the optimum way to copy elements between buffers. This calculation, while not terribly expensive itself, adds up with every call to memcpy(). Let's start off by writing some handy macros:

```#define COPYBY(TYPE, a, b, n) {         \
long __n = (n) / sizeof (TYPE);     \
register TYPE *__a = (TYPE *) (a);  \
register TYPE *__b = (TYPE *) (b);  \
\
do {                                \
*__a++ = *__b++;                \
} while (--__n > 0);                \
}

#define MEMCOPY(dest, src, n) {                 \
switch (copy_mode) {                        \
case 1: COPYBY (long, dest, src, n); break; \
case 2: COPYBY (int, dest, src, n); break;  \
default: memcpy (dest, src, n);             \
}                                           \
}
```

Now that these handy macros are written, we can plug them into our Merge Sort implementation:

```static void
msort (void *array, void *buf, size_t low, size_t high, size_t size,
int copy_mode, int (* compare) (const void *, const void *))
{
char *a1, *al, *am, *ah, *ls, *hs, *lo, *hi, *b;
size_t copied = 0;
size_t mid;

mid = MID (low, high);

if (mid + 1 < high)
msort (array, buf, mid + 1, high, size, compare);

if (mid > low)
msort (array, buf, low, mid, size, compare);

ah = ((char *) array) + ((high + 1) * size);
am = ((char *) array) + ((mid + 1) * size);
a1 = al = ((char *) array) + (low * size);

b = (char *) buf;
lo = al;
hi = am;

do {
ls = lo;
hs = hi;

if (lo > al || hi > am) {
/* our last loop already compared lo & hi and found lo <= hi */
lo += size;
}

while (lo < am && compare (lo, hi) <= 0)
lo += size;

if (lo < am) {
if (copied == 0) {
/* avoid copying the leading items */
a1 = lo;
ls = lo;
}

/* our last compare tells us hi < lo */
hi += size;

while (hi < ah && compare (hi, lo) < 0)
hi += size;

if (lo > ls) {
MEMCOPY (b, ls, lo - ls);
copied += (lo - ls);
b += (lo - ls);
}

MEMCOPY (b, hs, hi - hs);
copied += (hi - hs);
b += (hi - hs);
} else if (copied) {
MEMCOPY (b, ls, lo - ls);
copied += (lo - ls);
b += (lo - ls);

/* copy everything we needed to re-order back into array */
MEMCOPY (a1, buf, copied);
return;
} else {
/* everything already in order */
return;
}
} while (hi < ah);

if (lo < am) {
MEMCOPY (b, lo, am - lo);
copied += (am - lo);
}

MEMCOPY (a1, buf, copied);
}

int
MergeSort (void *base, size_t nmemb, size_t size,
int (* compare) (const void *, const void *))
{
int copy_mode;
void *tmp;

if (nmemb < 2)
return 0;

if (!(tmp = malloc (nmemb * size))) {
errno = ENOMEM;
return -1;
}

if ((((char *) base) - ((char *) 0)) % sizeof (long) == 0 && (size % sizeof (long)) == 0)
copy_mode = 1;
else if ((((char *) base) - ((char *) 0)) % sizeof (int) == 0 && (size % sizeof (int)) == 0)
copy_mode = 2;
else
copy_mode = 0;

msort (base, tmp, 0, nmemb - 1, size, copy_mode, compare);

free (tmp);

return 0;
}
```

This handy trick seems to have worked out rather well:

• Randomized input: 7.79s
• Sorted input: 0.99s
• Reversed input: 1.69s

At this point, I can't think of any other obvious optimizations so I'm going to call it a day.

For a recap, here are the results of all 4 implementations compared side-by-side with the results from qsort():

 qsort() msort() v1 msort() v2 msort() v3 msort() v4 random: 6.77 14.13 10.45 9.80 7.79 sorted: 1.54 4.41 2.08 0.95 0.99 reversed: 1.90 4.26 2.03 2.05 1.69