/* Copyright (c) 2010-2019 Christopher Swenson. */
/* Copyright (c) 2012 Vojtech Fried. */
/* Copyright (c) 2012 Google Inc. All Rights Reserved. */
/* https://github.com/swenson/sort */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* SORT_NAME (required): prefix for every generated identifier; see
   SORT_MAKE_STR, which pastes SORT_NAME, '_' and a routine suffix together. */
#ifndef SORT_NAME
#error "Must declare SORT_NAME"
#endif

/* SORT_TYPE (required): the element type being sorted. */
#ifndef SORT_TYPE
#error "Must declare SORT_TYPE"
#endif

/* SORT_CMP (optional): three-way comparison yielding <0, 0 or >0.
   The default uses only operator < and returns -1, 0 or 1. */
#ifndef SORT_CMP
#define SORT_CMP(x, y)  ((x) < (y) ? -1 : ((y) < (x) ? 1 : 0))
#endif

/* SORT_SAFE_CPY: when non-zero, elements are copied by assignment and buffers
   are created with new[]/delete[] instead of memcpy/memmove/malloc/free (see
   the SORT_SAFE_CPY branches further down in this file).  Only a C++ includer
   may enable it; for C builds any previous definition is discarded and the
   value is forced to 0. */
#ifdef __cplusplus
#ifndef SORT_SAFE_CPY
#define SORT_SAFE_CPY 0
#endif
#else
#undef SORT_SAFE_CPY
#define SORT_SAFE_CPY 0
#endif

/* Tuning constants with overridable defaults.
   NOTE(review): their uses are outside this part of the file; presumably the
   tim sort run-stack capacity and its galloping threshold - confirm there. */
#ifndef TIM_SORT_STACK_SIZE
#define TIM_SORT_STACK_SIZE 128
#endif

#ifndef TIM_SORT_MIN_GALLOP
#define TIM_SORT_MIN_GALLOP 7
#endif

/* SORT_SWAP (optional): exchange two SORT_TYPE lvalues through a temporary.
   The default expands to a brace block and evaluates each argument twice, so
   the arguments must be free of side effects. */
#ifndef SORT_SWAP
#define SORT_SWAP(x,y) {SORT_TYPE _sort_swap_temp = (x); (x) = (y); (y) = _sort_swap_temp;}
#endif

/* Common, type-agnostic functions and constants that we don't want to declare twice. */
#ifndef SORT_COMMON_H
#define SORT_COMMON_H

/* Larger of two values.  Each argument may be evaluated twice, so do not pass
   expressions with side effects.  Skipped if the includer already has MAX. */
#ifndef MAX
#define MAX(x,y) (((x) > (y) ? (x) : (y)))
#endif

/* Smaller of two values; same double-evaluation caveat as MAX. */
#ifndef MIN
#define MIN(x,y) (((x) < (y) ? (x) : (y)))
#endif

static int compute_minrun(const uint64_t);

/* Gap sequence for SHELL_SORT, stored in increasing order (48 entries held as
   64-bit values).  SHELL_SORT starts at the largest gap not exceeding half the
   array size and walks down to the final gap of 1.
   From http://oeis.org/classic/A102549 */
static const uint64_t shell_gaps[48] = {1, 4, 10, 23, 57, 132, 301, 701, 1750, 4376, 10941, 27353, 68383, 170958, 427396, 1068491, 2671228, 6678071, 16695178, 41737946, 104344866, 260862166, 652155416, 1630388541, 4075971353LL, 10189928383LL, 25474820958LL, 63687052396LL, 159217630991LL, 398044077478LL, 995110193696LL, 2487775484241LL, 6219438710603LL, 15548596776508LL, 38871491941271LL, 97178729853178LL, 242946824632946LL, 607367061582366LL, 1518417653955916LL, 3796044134889791LL, 9490110337224478LL, 23725275843061196LL, 59313189607652991LL, 148282974019132478LL, 370707435047831196LL, 926768587619577991LL, 2316921469048944978LL, 5792303672622362446LL};

#ifndef CLZ
/* clang-only */
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
#if __has_builtin(__builtin_clzll) || (defined(__GNUC__) && ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ > 3)))
#define CLZ __builtin_clzll
#else

static int clzll(uint64_t);

/* Portable count-leading-zeros for a 64-bit value (adapted from Hacker's
   Delight).  Returns the number of zero bits above the highest set bit of x,
   or 64 when x is 0. */
static int clzll(uint64_t x)
{
    int zeros = 0;
    int width;

    if (x == 0) {
        return 64;
    }

    /* Binary search: if the top `width` bits are all clear, count them and
       shift them out; then halve the window (32, 16, 8, 4, 2, 1). */
    for (width = 32; width > 0; width >>= 1) {
        if ((x >> (64 - width)) == 0) {
            zeros += width;
            x <<= width;
        }
    }

    return zeros;
}

#define CLZ clzll
#endif
#endif

/* Compute the minimum run length for an input of `size` elements.

   Sizes below 64 are returned unchanged.  For larger sizes the six most
   significant bits of `size` are kept and 1 is added if any of the bits that
   were shifted out are set, giving a result in [32, 64].  A size of 0 yields 0.
   NOTE(review): presumably consumed by the tim sort; the caller is not visible
   in this part of the file. */
static __inline int compute_minrun(const uint64_t size)
{
    int top_bit;
    int shift;
    int minrun;
    uint64_t mask;

    /* CLZ may expand to __builtin_clzll, whose result is undefined for 0. */
    if (size == 0) {
        return 0;
    }

    top_bit = 64 - CLZ(size);       /* 1-based position of the highest set bit */
    shift = MAX(top_bit, 6) - 6;    /* number of low bits to drop so six remain */
    minrun = (int)(size >> shift);
    mask = (1ULL << shift) - 1;     /* selects exactly the dropped low bits */

    if (mask & size) {
        return minrun + 1;
    }

    return minrun;
}

/* Derive the value MERGE_SORT_IN_PLACE uses as its `r` from the array length.
   Lengths below 16 give 2.  Otherwise, with k = 62 - CLZ(len) (one less than
   the index of the highest set bit), the result is 2 raised to the integer
   quotient (2 * k) / 3. */
static __inline size_t rbnd(size_t len)
{
    if (len >= 16) {
        const int k = 62 - CLZ(len);
        return 1ULL << ((2 * k) / 3);
    }

    return 2;
}

#endif /* SORT_COMMON_H */

/* Identifier generation: SORT_MAKE_STR(suffix) expands to <SORT_NAME>_<suffix>.
   The extra SORT_MAKE_STR1 level forces SORT_NAME to be macro-expanded before
   the tokens are pasted together by SORT_CONCAT. */
#define SORT_CONCAT(x, y) x ## _ ## y
#define SORT_MAKE_STR1(x, y) SORT_CONCAT(x,y)
#define SORT_MAKE_STR(x) SORT_MAKE_STR1(SORT_NAME,x)

/* Arrays of at most SMALL_SORT_BND elements are handed to SMALL_SORT by the
   in-place merge sort instead of being merged.  Both can be overridden by
   defining them before this point. */
#ifndef SMALL_SORT_BND
#define SMALL_SORT_BND 16
#endif
#ifndef SMALL_SORT
#define SMALL_SORT BITONIC_SORT
/*#define SMALL_SORT BINARY_INSERTION_SORT*/
#endif

/* Generic routine and type names used throughout this file.  Each one expands,
   through SORT_MAKE_STR, to SORT_NAME followed by '_' and the lower-case
   suffix; for example, with SORT_NAME defined as foo, TIM_SORT becomes
   foo_tim_sort.  This keeps the symbols of different instantiations apart. */
#define SORT_TYPE_CPY                  SORT_MAKE_STR(sort_type_cpy)
#define SORT_TYPE_MOVE                 SORT_MAKE_STR(sort_type_move)
#define SORT_NEW_BUFFER                SORT_MAKE_STR(sort_new_buffer)
#define SORT_DELETE_BUFFER             SORT_MAKE_STR(sort_delete_buffer)
#define BITONIC_SORT                   SORT_MAKE_STR(bitonic_sort)
#define BINARY_INSERTION_FIND          SORT_MAKE_STR(binary_insertion_find)
#define BINARY_INSERTION_SORT_START    SORT_MAKE_STR(binary_insertion_sort_start)
#define BINARY_INSERTION_SORT          SORT_MAKE_STR(binary_insertion_sort)
#define REVERSE_ELEMENTS               SORT_MAKE_STR(reverse_elements)
#define COUNT_RUN                      SORT_MAKE_STR(count_run)
#define CHECK_INVARIANT                SORT_MAKE_STR(check_invariant)
#define TIM_SORT                       SORT_MAKE_STR(tim_sort)
#define TIM_SORT_GALLOP                SORT_MAKE_STR(tim_sort_gallop)
#define TIM_SORT_RESIZE                SORT_MAKE_STR(tim_sort_resize)
#define TIM_SORT_MERGE                 SORT_MAKE_STR(tim_sort_merge)
#define TIM_SORT_MERGE_LEFT            SORT_MAKE_STR(tim_sort_merge_left)
#define TIM_SORT_MERGE_RIGHT           SORT_MAKE_STR(tim_sort_merge_right)
#define TIM_SORT_COLLAPSE              SORT_MAKE_STR(tim_sort_collapse)
#define HEAP_SORT                      SORT_MAKE_STR(heap_sort)
#define MEDIAN                         SORT_MAKE_STR(median)
#define QUICK_SORT                     SORT_MAKE_STR(quick_sort)
#define MERGE_SORT                     SORT_MAKE_STR(merge_sort)
#define MERGE_SORT_RECURSIVE           SORT_MAKE_STR(merge_sort_recursive)
#define MERGE_SORT_IN_PLACE            SORT_MAKE_STR(merge_sort_in_place)
#define MERGE_SORT_IN_PLACE_RMERGE     SORT_MAKE_STR(merge_sort_in_place_rmerge)
#define MERGE_SORT_IN_PLACE_BACKMERGE  SORT_MAKE_STR(merge_sort_in_place_backmerge)
#define MERGE_SORT_IN_PLACE_FRONTMERGE SORT_MAKE_STR(merge_sort_in_place_frontmerge)
#define MERGE_SORT_IN_PLACE_ASWAP      SORT_MAKE_STR(merge_sort_in_place_aswap)
#define SELECTION_SORT                 SORT_MAKE_STR(selection_sort)
#define SHELL_SORT                     SORT_MAKE_STR(shell_sort)
#define QUICK_SORT_PARTITION           SORT_MAKE_STR(quick_sort_partition)
#define QUICK_SORT_RECURSIVE           SORT_MAKE_STR(quick_sort_recursive)
#define HEAP_SIFT_DOWN                 SORT_MAKE_STR(heap_sift_down)
#define HEAPIFY                        SORT_MAKE_STR(heapify)
#define TIM_SORT_RUN_T                 SORT_MAKE_STR(tim_sort_run_t)
#define TEMP_STORAGE_T                 SORT_MAKE_STR(temp_storage_t)
#define PUSH_NEXT                      SORT_MAKE_STR(push_next)
#define GRAIL_SWAP1                    SORT_MAKE_STR(grail_swap1)
#define REC_STABLE_SORT                SORT_MAKE_STR(rec_stable_sort)
#define GRAIL_REC_MERGE                SORT_MAKE_STR(grail_rec_merge)
#define GRAIL_SORT_DYN_BUFFER          SORT_MAKE_STR(grail_sort_dyn_buffer)
#define GRAIL_SORT_FIXED_BUFFER        SORT_MAKE_STR(grail_sort_fixed_buffer)
#define GRAIL_COMMON_SORT              SORT_MAKE_STR(grail_common_sort)
#define GRAIL_SORT                     SORT_MAKE_STR(grail_sort)
#define GRAIL_COMBINE_BLOCKS           SORT_MAKE_STR(grail_combine_blocks)
#define GRAIL_LAZY_STABLE_SORT         SORT_MAKE_STR(grail_lazy_stable_sort)
#define GRAIL_MERGE_WITHOUT_BUFFER     SORT_MAKE_STR(grail_merge_without_buffer)
#define GRAIL_ROTATE                   SORT_MAKE_STR(grail_rotate)
#define GRAIL_BIN_SEARCH_LEFT          SORT_MAKE_STR(grail_bin_search_left)
#define GRAIL_BUILD_BLOCKS             SORT_MAKE_STR(grail_build_blocks)
#define GRAIL_FIND_KEYS                SORT_MAKE_STR(grail_find_keys)
#define GRAIL_MERGE_BUFFERS_LEFT_WITH_X_BUF SORT_MAKE_STR(grail_merge_buffers_left_with_x_buf)
#define GRAIL_BIN_SEARCH_RIGHT         SORT_MAKE_STR(grail_bin_search_right)
#define GRAIL_MERGE_BUFFERS_LEFT       SORT_MAKE_STR(grail_merge_buffers_left)
#define GRAIL_SMART_MERGE_WITH_X_BUF   SORT_MAKE_STR(grail_smart_merge_with_x_buf)
#define GRAIL_MERGE_LEFT_WITH_X_BUF    SORT_MAKE_STR(grail_merge_left_with_x_buf)
#define GRAIL_SMART_MERGE_WITHOUT_BUFFER SORT_MAKE_STR(grail_smart_merge_without_buffer)
#define GRAIL_SMART_MERGE_WITH_BUFFER  SORT_MAKE_STR(grail_smart_merge_with_buffer)
#define GRAIL_MERGE_RIGHT              SORT_MAKE_STR(grail_merge_right)
#define GRAIL_MERGE_LEFT               SORT_MAKE_STR(grail_merge_left)
#define GRAIL_SWAP_N                   SORT_MAKE_STR(grail_swap_n)
#define SQRT_SORT                      SORT_MAKE_STR(sqrt_sort)
#define SQRT_SORT_BUILD_BLOCKS         SORT_MAKE_STR(sqrt_sort_build_blocks)
#define SQRT_SORT_MERGE_BUFFERS_LEFT_WITH_X_BUF SORT_MAKE_STR(sqrt_sort_merge_buffers_left_with_x_buf)
#define SQRT_SORT_MERGE_DOWN           SORT_MAKE_STR(sqrt_sort_merge_down)
#define SQRT_SORT_MERGE_LEFT_WITH_X_BUF SORT_MAKE_STR(sqrt_sort_merge_left_with_x_buf)
#define SQRT_SORT_MERGE_RIGHT          SORT_MAKE_STR(sqrt_sort_merge_right)
#define SQRT_SORT_SWAP_N               SORT_MAKE_STR(sqrt_sort_swap_n)
#define SQRT_SORT_SWAP_1               SORT_MAKE_STR(sqrt_sort_swap_1)
#define SQRT_SORT_SMART_MERGE_WITH_X_BUF SORT_MAKE_STR(sqrt_sort_smart_merge_with_x_buf)
#define SQRT_SORT_SORT_INS             SORT_MAKE_STR(sqrt_sort_sort_ins)
#define SQRT_SORT_COMBINE_BLOCKS       SORT_MAKE_STR(sqrt_sort_combine_blocks)
#define SQRT_SORT_COMMON_SORT          SORT_MAKE_STR(sqrt_sort_common_sort)
#define BUBBLE_SORT                    SORT_MAKE_STR(bubble_sort)

/* MAX/MIN are repeated here because the SORT_COMMON_H section above is skipped
   when that guard is already defined.  Arguments may be evaluated twice. */
#ifndef MAX
#define MAX(x,y) (((x) > (y) ? (x) : (y)))
#endif
#ifndef MIN
#define MIN(x,y) (((x) < (y) ? (x) : (y)))
#endif
/* Compare-exchange: swap x and y when SORT_CMP says x sorts after y, so that
   afterwards x does not compare greater than y.  Building block of the sorting
   networks below.  Both arguments are evaluated more than once. */
#ifndef SORT_CSWAP
#define SORT_CSWAP(x, y) { if(SORT_CMP((x),(y)) > 0) {SORT_SWAP((x),(y));}}
#endif

/* Describes one run by its start and length.
   NOTE(review): presumably `start` is an element index into the array being
   sorted and `length` an element count, used by the tim sort run stack; the
   code that uses this type is outside this part of the file - confirm there. */
typedef struct {
    size_t start;
    size_t length;
} TIM_SORT_RUN_T;


/* Forward declarations of the sort entry points generated for this
   SORT_NAME / SORT_TYPE.  Each takes a pointer to the first element and the
   number of elements.
   NOTE(review): all are expected to sort dst[0..size) in place according to
   SORT_CMP; only some of the definitions are visible in this part of the file. */
void SHELL_SORT(SORT_TYPE *dst, const size_t size);
void BINARY_INSERTION_SORT(SORT_TYPE *dst, const size_t size);
void HEAP_SORT(SORT_TYPE *dst, const size_t size);
void QUICK_SORT(SORT_TYPE *dst, const size_t size);
void MERGE_SORT(SORT_TYPE *dst, const size_t size);
void MERGE_SORT_IN_PLACE(SORT_TYPE *dst, const size_t size);
void SELECTION_SORT(SORT_TYPE *dst, const size_t size);
void TIM_SORT(SORT_TYPE *dst, const size_t size);
void BUBBLE_SORT(SORT_TYPE *dst, const size_t size);
void BITONIC_SORT(SORT_TYPE *dst, const size_t size);
void REC_STABLE_SORT(SORT_TYPE *dst, const size_t size);
void GRAIL_SORT_DYN_BUFFER(SORT_TYPE *dst, const size_t size);
void GRAIL_SORT_FIXED_BUFFER(SORT_TYPE *dst, const size_t size);
void GRAIL_SORT(SORT_TYPE *dst, const size_t size);
void SQRT_SORT(SORT_TYPE *dst, const size_t size);

/* The full implementation of a bitonic sort is not here. Since we only want to use
   sorting networks for small length lists we create optimal sorting networks for
   lists of length <= 16 and call out to BINARY_INSERTION_SORT for anything larger
   than 16.
   Optimal sorting networks for small length lists.
   Taken from https://pages.ripco.net/~jgamble/nw.html */
/* Sorting network for exactly 2 elements: a single compare-exchange leaves
   dst[0] and dst[1] in SORT_CMP order. */
#define BITONIC_SORT_2          SORT_MAKE_STR(bitonic_sort_2)
static __inline void BITONIC_SORT_2(SORT_TYPE *dst)
{
    SORT_CSWAP(dst[0], dst[1]);
}


/* Sorting network for exactly 3 elements (dst[0..2]), 3 compare-exchanges.
   The sequence of comparators is significant; do not reorder. */
#define BITONIC_SORT_3          SORT_MAKE_STR(bitonic_sort_3)
static __inline void BITONIC_SORT_3(SORT_TYPE *dst)
{
    SORT_CSWAP(dst[1], dst[2]);
    SORT_CSWAP(dst[0], dst[2]);
    SORT_CSWAP(dst[0], dst[1]);
}


/* Sorting network for exactly 4 elements (dst[0..3]), 5 compare-exchanges.
   The sequence of comparators is significant; do not reorder. */
#define BITONIC_SORT_4          SORT_MAKE_STR(bitonic_sort_4)
static __inline void BITONIC_SORT_4(SORT_TYPE *dst)
{
    SORT_CSWAP(dst[0], dst[1]);
    SORT_CSWAP(dst[2], dst[3]);
    SORT_CSWAP(dst[0], dst[2]);
    SORT_CSWAP(dst[1], dst[3]);
    SORT_CSWAP(dst[1], dst[2]);
}


/* Sorting network for exactly 5 elements (dst[0..4]), 9 compare-exchanges.
   The sequence of comparators is significant; do not reorder. */
#define BITONIC_SORT_5          SORT_MAKE_STR(bitonic_sort_5)
static __inline void BITONIC_SORT_5(SORT_TYPE *dst)
{
    SORT_CSWAP(dst[0], dst[1]);
    SORT_CSWAP(dst[3], dst[4]);
    SORT_CSWAP(dst[2], dst[4]);
    SORT_CSWAP(dst[2], dst[3]);
    SORT_CSWAP(dst[1], dst[4]);
    SORT_CSWAP(dst[0], dst[3]);
    SORT_CSWAP(dst[0], dst[2]);
    SORT_CSWAP(dst[1], dst[3]);
    SORT_CSWAP(dst[1], dst[2]);
}


/* Sorting network for exactly 6 elements (dst[0..5]), 12 compare-exchanges.
   The sequence of comparators is significant; do not reorder. */
#define BITONIC_SORT_6          SORT_MAKE_STR(bitonic_sort_6)
static __inline void BITONIC_SORT_6(SORT_TYPE *dst)
{
    SORT_CSWAP(dst[1], dst[2]);
    SORT_CSWAP(dst[4], dst[5]);
    SORT_CSWAP(dst[0], dst[2]);
    SORT_CSWAP(dst[3], dst[5]);
    SORT_CSWAP(dst[0], dst[1]);
    SORT_CSWAP(dst[3], dst[4]);
    SORT_CSWAP(dst[2], dst[5]);
    SORT_CSWAP(dst[0], dst[3]);
    SORT_CSWAP(dst[1], dst[4]);
    SORT_CSWAP(dst[2], dst[4]);
    SORT_CSWAP(dst[1], dst[3]);
    SORT_CSWAP(dst[2], dst[3]);
}


/* Sorting network for exactly 7 elements (dst[0..6]), 16 compare-exchanges.
   The sequence of comparators is significant; do not reorder. */
#define BITONIC_SORT_7          SORT_MAKE_STR(bitonic_sort_7)
static __inline void BITONIC_SORT_7(SORT_TYPE *dst)
{
    SORT_CSWAP(dst[1], dst[2]);
    SORT_CSWAP(dst[3], dst[4]);
    SORT_CSWAP(dst[5], dst[6]);
    SORT_CSWAP(dst[0], dst[2]);
    SORT_CSWAP(dst[3], dst[5]);
    SORT_CSWAP(dst[4], dst[6]);
    SORT_CSWAP(dst[0], dst[1]);
    SORT_CSWAP(dst[4], dst[5]);
    SORT_CSWAP(dst[2], dst[6]);
    SORT_CSWAP(dst[0], dst[4]);
    SORT_CSWAP(dst[1], dst[5]);
    SORT_CSWAP(dst[0], dst[3]);
    SORT_CSWAP(dst[2], dst[5]);
    SORT_CSWAP(dst[1], dst[3]);
    SORT_CSWAP(dst[2], dst[4]);
    SORT_CSWAP(dst[2], dst[3]);
}


/* Sorting network for exactly 8 elements (dst[0..7]), 19 compare-exchanges.
   The sequence of comparators is significant; do not reorder. */
#define BITONIC_SORT_8          SORT_MAKE_STR(bitonic_sort_8)
static __inline void BITONIC_SORT_8(SORT_TYPE *dst)
{
    SORT_CSWAP(dst[0], dst[1]);
    SORT_CSWAP(dst[2], dst[3]);
    SORT_CSWAP(dst[4], dst[5]);
    SORT_CSWAP(dst[6], dst[7]);
    SORT_CSWAP(dst[0], dst[2]);
    SORT_CSWAP(dst[1], dst[3]);
    SORT_CSWAP(dst[4], dst[6]);
    SORT_CSWAP(dst[5], dst[7]);
    SORT_CSWAP(dst[1], dst[2]);
    SORT_CSWAP(dst[5], dst[6]);
    SORT_CSWAP(dst[0], dst[4]);
    SORT_CSWAP(dst[3], dst[7]);
    SORT_CSWAP(dst[1], dst[5]);
    SORT_CSWAP(dst[2], dst[6]);
    SORT_CSWAP(dst[1], dst[4]);
    SORT_CSWAP(dst[3], dst[6]);
    SORT_CSWAP(dst[2], dst[4]);
    SORT_CSWAP(dst[3], dst[5]);
    SORT_CSWAP(dst[3], dst[4]);
}


/* Sorting network for exactly 9 elements (dst[0..8]), 25 compare-exchanges.
   The sequence of comparators is significant; do not reorder. */
#define BITONIC_SORT_9          SORT_MAKE_STR(bitonic_sort_9)
static __inline void BITONIC_SORT_9(SORT_TYPE *dst)
{
    SORT_CSWAP(dst[0], dst[1]);
    SORT_CSWAP(dst[3], dst[4]);
    SORT_CSWAP(dst[6], dst[7]);
    SORT_CSWAP(dst[1], dst[2]);
    SORT_CSWAP(dst[4], dst[5]);
    SORT_CSWAP(dst[7], dst[8]);
    SORT_CSWAP(dst[0], dst[1]);
    SORT_CSWAP(dst[3], dst[4]);
    SORT_CSWAP(dst[6], dst[7]);
    SORT_CSWAP(dst[2], dst[5]);
    SORT_CSWAP(dst[0], dst[3]);
    SORT_CSWAP(dst[1], dst[4]);
    SORT_CSWAP(dst[5], dst[8]);
    SORT_CSWAP(dst[3], dst[6]);
    SORT_CSWAP(dst[4], dst[7]);
    SORT_CSWAP(dst[2], dst[5]);
    SORT_CSWAP(dst[0], dst[3]);
    SORT_CSWAP(dst[1], dst[4]);
    SORT_CSWAP(dst[5], dst[7]);
    SORT_CSWAP(dst[2], dst[6]);
    SORT_CSWAP(dst[1], dst[3]);
    SORT_CSWAP(dst[4], dst[6]);
    SORT_CSWAP(dst[2], dst[4]);
    SORT_CSWAP(dst[5], dst[6]);
    SORT_CSWAP(dst[2], dst[3]);
}


/* Sorting network for exactly 10 elements (dst[0..9]), 29 compare-exchanges.
   The sequence of comparators is significant; do not reorder. */
#define BITONIC_SORT_10          SORT_MAKE_STR(bitonic_sort_10)
static __inline void BITONIC_SORT_10(SORT_TYPE *dst)
{
    SORT_CSWAP(dst[4], dst[9]);
    SORT_CSWAP(dst[3], dst[8]);
    SORT_CSWAP(dst[2], dst[7]);
    SORT_CSWAP(dst[1], dst[6]);
    SORT_CSWAP(dst[0], dst[5]);
    SORT_CSWAP(dst[1], dst[4]);
    SORT_CSWAP(dst[6], dst[9]);
    SORT_CSWAP(dst[0], dst[3]);
    SORT_CSWAP(dst[5], dst[8]);
    SORT_CSWAP(dst[0], dst[2]);
    SORT_CSWAP(dst[3], dst[6]);
    SORT_CSWAP(dst[7], dst[9]);
    SORT_CSWAP(dst[0], dst[1]);
    SORT_CSWAP(dst[2], dst[4]);
    SORT_CSWAP(dst[5], dst[7]);
    SORT_CSWAP(dst[8], dst[9]);
    SORT_CSWAP(dst[1], dst[2]);
    SORT_CSWAP(dst[4], dst[6]);
    SORT_CSWAP(dst[7], dst[8]);
    SORT_CSWAP(dst[3], dst[5]);
    SORT_CSWAP(dst[2], dst[5]);
    SORT_CSWAP(dst[6], dst[8]);
    SORT_CSWAP(dst[1], dst[3]);
    SORT_CSWAP(dst[4], dst[7]);
    SORT_CSWAP(dst[2], dst[3]);
    SORT_CSWAP(dst[6], dst[7]);
    SORT_CSWAP(dst[3], dst[4]);
    SORT_CSWAP(dst[5], dst[6]);
    SORT_CSWAP(dst[4], dst[5]);
}


/* Sorting network for exactly 11 elements (dst[0..10]), 35 compare-exchanges.
   The sequence of comparators is significant; do not reorder. */
#define BITONIC_SORT_11          SORT_MAKE_STR(bitonic_sort_11)
static __inline void BITONIC_SORT_11(SORT_TYPE *dst)
{
    SORT_CSWAP(dst[0], dst[1]);
    SORT_CSWAP(dst[2], dst[3]);
    SORT_CSWAP(dst[4], dst[5]);
    SORT_CSWAP(dst[6], dst[7]);
    SORT_CSWAP(dst[8], dst[9]);
    SORT_CSWAP(dst[1], dst[3]);
    SORT_CSWAP(dst[5], dst[7]);
    SORT_CSWAP(dst[0], dst[2]);
    SORT_CSWAP(dst[4], dst[6]);
    SORT_CSWAP(dst[8], dst[10]);
    SORT_CSWAP(dst[1], dst[2]);
    SORT_CSWAP(dst[5], dst[6]);
    SORT_CSWAP(dst[9], dst[10]);
    SORT_CSWAP(dst[0], dst[4]);
    SORT_CSWAP(dst[3], dst[7]);
    SORT_CSWAP(dst[1], dst[5]);
    SORT_CSWAP(dst[6], dst[10]);
    SORT_CSWAP(dst[4], dst[8]);
    SORT_CSWAP(dst[5], dst[9]);
    SORT_CSWAP(dst[2], dst[6]);
    SORT_CSWAP(dst[0], dst[4]);
    SORT_CSWAP(dst[3], dst[8]);
    SORT_CSWAP(dst[1], dst[5]);
    SORT_CSWAP(dst[6], dst[10]);
    SORT_CSWAP(dst[2], dst[3]);
    SORT_CSWAP(dst[8], dst[9]);
    SORT_CSWAP(dst[1], dst[4]);
    SORT_CSWAP(dst[7], dst[10]);
    SORT_CSWAP(dst[3], dst[5]);
    SORT_CSWAP(dst[6], dst[8]);
    SORT_CSWAP(dst[2], dst[4]);
    SORT_CSWAP(dst[7], dst[9]);
    SORT_CSWAP(dst[5], dst[6]);
    SORT_CSWAP(dst[3], dst[4]);
    SORT_CSWAP(dst[7], dst[8]);
}


/* Sorting network for exactly 12 elements (dst[0..11]), 39 compare-exchanges.
   The sequence of comparators is significant; do not reorder. */
#define BITONIC_SORT_12          SORT_MAKE_STR(bitonic_sort_12)
static __inline void BITONIC_SORT_12(SORT_TYPE *dst)
{
    SORT_CSWAP(dst[0], dst[1]);
    SORT_CSWAP(dst[2], dst[3]);
    SORT_CSWAP(dst[4], dst[5]);
    SORT_CSWAP(dst[6], dst[7]);
    SORT_CSWAP(dst[8], dst[9]);
    SORT_CSWAP(dst[10], dst[11]);
    SORT_CSWAP(dst[1], dst[3]);
    SORT_CSWAP(dst[5], dst[7]);
    SORT_CSWAP(dst[9], dst[11]);
    SORT_CSWAP(dst[0], dst[2]);
    SORT_CSWAP(dst[4], dst[6]);
    SORT_CSWAP(dst[8], dst[10]);
    SORT_CSWAP(dst[1], dst[2]);
    SORT_CSWAP(dst[5], dst[6]);
    SORT_CSWAP(dst[9], dst[10]);
    SORT_CSWAP(dst[0], dst[4]);
    SORT_CSWAP(dst[7], dst[11]);
    SORT_CSWAP(dst[1], dst[5]);
    SORT_CSWAP(dst[6], dst[10]);
    SORT_CSWAP(dst[3], dst[7]);
    SORT_CSWAP(dst[4], dst[8]);
    SORT_CSWAP(dst[5], dst[9]);
    SORT_CSWAP(dst[2], dst[6]);
    SORT_CSWAP(dst[0], dst[4]);
    SORT_CSWAP(dst[7], dst[11]);
    SORT_CSWAP(dst[3], dst[8]);
    SORT_CSWAP(dst[1], dst[5]);
    SORT_CSWAP(dst[6], dst[10]);
    SORT_CSWAP(dst[2], dst[3]);
    SORT_CSWAP(dst[8], dst[9]);
    SORT_CSWAP(dst[1], dst[4]);
    SORT_CSWAP(dst[7], dst[10]);
    SORT_CSWAP(dst[3], dst[5]);
    SORT_CSWAP(dst[6], dst[8]);
    SORT_CSWAP(dst[2], dst[4]);
    SORT_CSWAP(dst[7], dst[9]);
    SORT_CSWAP(dst[5], dst[6]);
    SORT_CSWAP(dst[3], dst[4]);
    SORT_CSWAP(dst[7], dst[8]);
}


/* Sorting network for exactly 13 elements (dst[0..12]), 45 compare-exchanges.
   The sequence of comparators is significant; do not reorder. */
#define BITONIC_SORT_13          SORT_MAKE_STR(bitonic_sort_13)
static __inline void BITONIC_SORT_13(SORT_TYPE *dst)
{
    SORT_CSWAP(dst[1], dst[7]);
    SORT_CSWAP(dst[9], dst[11]);
    SORT_CSWAP(dst[3], dst[4]);
    SORT_CSWAP(dst[5], dst[8]);
    SORT_CSWAP(dst[0], dst[12]);
    SORT_CSWAP(dst[2], dst[6]);
    SORT_CSWAP(dst[0], dst[1]);
    SORT_CSWAP(dst[2], dst[3]);
    SORT_CSWAP(dst[4], dst[6]);
    SORT_CSWAP(dst[8], dst[11]);
    SORT_CSWAP(dst[7], dst[12]);
    SORT_CSWAP(dst[5], dst[9]);
    SORT_CSWAP(dst[0], dst[2]);
    SORT_CSWAP(dst[3], dst[7]);
    SORT_CSWAP(dst[10], dst[11]);
    SORT_CSWAP(dst[1], dst[4]);
    SORT_CSWAP(dst[6], dst[12]);
    SORT_CSWAP(dst[7], dst[8]);
    SORT_CSWAP(dst[11], dst[12]);
    SORT_CSWAP(dst[4], dst[9]);
    SORT_CSWAP(dst[6], dst[10]);
    SORT_CSWAP(dst[3], dst[4]);
    SORT_CSWAP(dst[5], dst[6]);
    SORT_CSWAP(dst[8], dst[9]);
    SORT_CSWAP(dst[10], dst[11]);
    SORT_CSWAP(dst[1], dst[7]);
    SORT_CSWAP(dst[2], dst[6]);
    SORT_CSWAP(dst[9], dst[11]);
    SORT_CSWAP(dst[1], dst[3]);
    SORT_CSWAP(dst[4], dst[7]);
    SORT_CSWAP(dst[8], dst[10]);
    SORT_CSWAP(dst[0], dst[5]);
    SORT_CSWAP(dst[2], dst[5]);
    SORT_CSWAP(dst[6], dst[8]);
    SORT_CSWAP(dst[9], dst[10]);
    SORT_CSWAP(dst[1], dst[2]);
    SORT_CSWAP(dst[3], dst[5]);
    SORT_CSWAP(dst[7], dst[8]);
    SORT_CSWAP(dst[4], dst[6]);
    SORT_CSWAP(dst[2], dst[3]);
    SORT_CSWAP(dst[4], dst[5]);
    SORT_CSWAP(dst[6], dst[7]);
    SORT_CSWAP(dst[8], dst[9]);
    SORT_CSWAP(dst[3], dst[4]);
    SORT_CSWAP(dst[5], dst[6]);
}


/* Sorting network for exactly 14 elements (dst[0..13]), 51 compare-exchanges.
   The sequence of comparators is significant; do not reorder. */
#define BITONIC_SORT_14          SORT_MAKE_STR(bitonic_sort_14)
static __inline void BITONIC_SORT_14(SORT_TYPE *dst)
{
    SORT_CSWAP(dst[0], dst[1]);
    SORT_CSWAP(dst[2], dst[3]);
    SORT_CSWAP(dst[4], dst[5]);
    SORT_CSWAP(dst[6], dst[7]);
    SORT_CSWAP(dst[8], dst[9]);
    SORT_CSWAP(dst[10], dst[11]);
    SORT_CSWAP(dst[12], dst[13]);
    SORT_CSWAP(dst[0], dst[2]);
    SORT_CSWAP(dst[4], dst[6]);
    SORT_CSWAP(dst[8], dst[10]);
    SORT_CSWAP(dst[1], dst[3]);
    SORT_CSWAP(dst[5], dst[7]);
    SORT_CSWAP(dst[9], dst[11]);
    SORT_CSWAP(dst[0], dst[4]);
    SORT_CSWAP(dst[8], dst[12]);
    SORT_CSWAP(dst[1], dst[5]);
    SORT_CSWAP(dst[9], dst[13]);
    SORT_CSWAP(dst[2], dst[6]);
    SORT_CSWAP(dst[3], dst[7]);
    SORT_CSWAP(dst[0], dst[8]);
    SORT_CSWAP(dst[1], dst[9]);
    SORT_CSWAP(dst[2], dst[10]);
    SORT_CSWAP(dst[3], dst[11]);
    SORT_CSWAP(dst[4], dst[12]);
    SORT_CSWAP(dst[5], dst[13]);
    SORT_CSWAP(dst[5], dst[10]);
    SORT_CSWAP(dst[6], dst[9]);
    SORT_CSWAP(dst[3], dst[12]);
    SORT_CSWAP(dst[7], dst[11]);
    SORT_CSWAP(dst[1], dst[2]);
    SORT_CSWAP(dst[4], dst[8]);
    SORT_CSWAP(dst[1], dst[4]);
    SORT_CSWAP(dst[7], dst[13]);
    SORT_CSWAP(dst[2], dst[8]);
    SORT_CSWAP(dst[5], dst[6]);
    SORT_CSWAP(dst[9], dst[10]);
    SORT_CSWAP(dst[2], dst[4]);
    SORT_CSWAP(dst[11], dst[13]);
    SORT_CSWAP(dst[3], dst[8]);
    SORT_CSWAP(dst[7], dst[12]);
    SORT_CSWAP(dst[6], dst[8]);
    SORT_CSWAP(dst[10], dst[12]);
    SORT_CSWAP(dst[3], dst[5]);
    SORT_CSWAP(dst[7], dst[9]);
    SORT_CSWAP(dst[3], dst[4]);
    SORT_CSWAP(dst[5], dst[6]);
    SORT_CSWAP(dst[7], dst[8]);
    SORT_CSWAP(dst[9], dst[10]);
    SORT_CSWAP(dst[11], dst[12]);
    SORT_CSWAP(dst[6], dst[7]);
    SORT_CSWAP(dst[8], dst[9]);
}


/* Sorting network for exactly 15 elements (dst[0..14]), 56 compare-exchanges.
   The sequence of comparators is significant; do not reorder. */
#define BITONIC_SORT_15          SORT_MAKE_STR(bitonic_sort_15)
static __inline void BITONIC_SORT_15(SORT_TYPE *dst)
{
    SORT_CSWAP(dst[0], dst[1]);
    SORT_CSWAP(dst[2], dst[3]);
    SORT_CSWAP(dst[4], dst[5]);
    SORT_CSWAP(dst[6], dst[7]);
    SORT_CSWAP(dst[8], dst[9]);
    SORT_CSWAP(dst[10], dst[11]);
    SORT_CSWAP(dst[12], dst[13]);
    SORT_CSWAP(dst[0], dst[2]);
    SORT_CSWAP(dst[4], dst[6]);
    SORT_CSWAP(dst[8], dst[10]);
    SORT_CSWAP(dst[12], dst[14]);
    SORT_CSWAP(dst[1], dst[3]);
    SORT_CSWAP(dst[5], dst[7]);
    SORT_CSWAP(dst[9], dst[11]);
    SORT_CSWAP(dst[0], dst[4]);
    SORT_CSWAP(dst[8], dst[12]);
    SORT_CSWAP(dst[1], dst[5]);
    SORT_CSWAP(dst[9], dst[13]);
    SORT_CSWAP(dst[2], dst[6]);
    SORT_CSWAP(dst[10], dst[14]);
    SORT_CSWAP(dst[3], dst[7]);
    SORT_CSWAP(dst[0], dst[8]);
    SORT_CSWAP(dst[1], dst[9]);
    SORT_CSWAP(dst[2], dst[10]);
    SORT_CSWAP(dst[3], dst[11]);
    SORT_CSWAP(dst[4], dst[12]);
    SORT_CSWAP(dst[5], dst[13]);
    SORT_CSWAP(dst[6], dst[14]);
    SORT_CSWAP(dst[5], dst[10]);
    SORT_CSWAP(dst[6], dst[9]);
    SORT_CSWAP(dst[3], dst[12]);
    SORT_CSWAP(dst[13], dst[14]);
    SORT_CSWAP(dst[7], dst[11]);
    SORT_CSWAP(dst[1], dst[2]);
    SORT_CSWAP(dst[4], dst[8]);
    SORT_CSWAP(dst[1], dst[4]);
    SORT_CSWAP(dst[7], dst[13]);
    SORT_CSWAP(dst[2], dst[8]);
    SORT_CSWAP(dst[11], dst[14]);
    SORT_CSWAP(dst[5], dst[6]);
    SORT_CSWAP(dst[9], dst[10]);
    SORT_CSWAP(dst[2], dst[4]);
    SORT_CSWAP(dst[11], dst[13]);
    SORT_CSWAP(dst[3], dst[8]);
    SORT_CSWAP(dst[7], dst[12]);
    SORT_CSWAP(dst[6], dst[8]);
    SORT_CSWAP(dst[10], dst[12]);
    SORT_CSWAP(dst[3], dst[5]);
    SORT_CSWAP(dst[7], dst[9]);
    SORT_CSWAP(dst[3], dst[4]);
    SORT_CSWAP(dst[5], dst[6]);
    SORT_CSWAP(dst[7], dst[8]);
    SORT_CSWAP(dst[9], dst[10]);
    SORT_CSWAP(dst[11], dst[12]);
    SORT_CSWAP(dst[6], dst[7]);
    SORT_CSWAP(dst[8], dst[9]);
}


/* Sorting network for exactly 16 elements (dst[0..15]), 60 compare-exchanges.
   The sequence of comparators is significant; do not reorder. */
#define BITONIC_SORT_16          SORT_MAKE_STR(bitonic_sort_16)
static __inline void BITONIC_SORT_16(SORT_TYPE *dst)
{
    SORT_CSWAP(dst[0], dst[1]);
    SORT_CSWAP(dst[2], dst[3]);
    SORT_CSWAP(dst[4], dst[5]);
    SORT_CSWAP(dst[6], dst[7]);
    SORT_CSWAP(dst[8], dst[9]);
    SORT_CSWAP(dst[10], dst[11]);
    SORT_CSWAP(dst[12], dst[13]);
    SORT_CSWAP(dst[14], dst[15]);
    SORT_CSWAP(dst[0], dst[2]);
    SORT_CSWAP(dst[4], dst[6]);
    SORT_CSWAP(dst[8], dst[10]);
    SORT_CSWAP(dst[12], dst[14]);
    SORT_CSWAP(dst[1], dst[3]);
    SORT_CSWAP(dst[5], dst[7]);
    SORT_CSWAP(dst[9], dst[11]);
    SORT_CSWAP(dst[13], dst[15]);
    SORT_CSWAP(dst[0], dst[4]);
    SORT_CSWAP(dst[8], dst[12]);
    SORT_CSWAP(dst[1], dst[5]);
    SORT_CSWAP(dst[9], dst[13]);
    SORT_CSWAP(dst[2], dst[6]);
    SORT_CSWAP(dst[10], dst[14]);
    SORT_CSWAP(dst[3], dst[7]);
    SORT_CSWAP(dst[11], dst[15]);
    SORT_CSWAP(dst[0], dst[8]);
    SORT_CSWAP(dst[1], dst[9]);
    SORT_CSWAP(dst[2], dst[10]);
    SORT_CSWAP(dst[3], dst[11]);
    SORT_CSWAP(dst[4], dst[12]);
    SORT_CSWAP(dst[5], dst[13]);
    SORT_CSWAP(dst[6], dst[14]);
    SORT_CSWAP(dst[7], dst[15]);
    SORT_CSWAP(dst[5], dst[10]);
    SORT_CSWAP(dst[6], dst[9]);
    SORT_CSWAP(dst[3], dst[12]);
    SORT_CSWAP(dst[13], dst[14]);
    SORT_CSWAP(dst[7], dst[11]);
    SORT_CSWAP(dst[1], dst[2]);
    SORT_CSWAP(dst[4], dst[8]);
    SORT_CSWAP(dst[1], dst[4]);
    SORT_CSWAP(dst[7], dst[13]);
    SORT_CSWAP(dst[2], dst[8]);
    SORT_CSWAP(dst[11], dst[14]);
    SORT_CSWAP(dst[5], dst[6]);
    SORT_CSWAP(dst[9], dst[10]);
    SORT_CSWAP(dst[2], dst[4]);
    SORT_CSWAP(dst[11], dst[13]);
    SORT_CSWAP(dst[3], dst[8]);
    SORT_CSWAP(dst[7], dst[12]);
    SORT_CSWAP(dst[6], dst[8]);
    SORT_CSWAP(dst[10], dst[12]);
    SORT_CSWAP(dst[3], dst[5]);
    SORT_CSWAP(dst[7], dst[9]);
    SORT_CSWAP(dst[3], dst[4]);
    SORT_CSWAP(dst[5], dst[6]);
    SORT_CSWAP(dst[7], dst[8]);
    SORT_CSWAP(dst[9], dst[10]);
    SORT_CSWAP(dst[11], dst[12]);
    SORT_CSWAP(dst[6], dst[7]);
    SORT_CSWAP(dst[8], dst[9]);
}

/* Sort dst[0..size) in place.  Sizes 0 and 1 need no work, sizes 2 through 16
   are dispatched to the fixed sorting network for that exact length, and
   anything longer is delegated to BINARY_INSERTION_SORT. */
void BITONIC_SORT(SORT_TYPE *dst, const size_t size)
{
    if (size < 2) {
        return;
    }

    if (size > 16) {
        BINARY_INSERTION_SORT(dst, size);
        return;
    }

    switch (size) {
    case 2:  BITONIC_SORT_2(dst);  return;
    case 3:  BITONIC_SORT_3(dst);  return;
    case 4:  BITONIC_SORT_4(dst);  return;
    case 5:  BITONIC_SORT_5(dst);  return;
    case 6:  BITONIC_SORT_6(dst);  return;
    case 7:  BITONIC_SORT_7(dst);  return;
    case 8:  BITONIC_SORT_8(dst);  return;
    case 9:  BITONIC_SORT_9(dst);  return;
    case 10: BITONIC_SORT_10(dst); return;
    case 11: BITONIC_SORT_11(dst); return;
    case 12: BITONIC_SORT_12(dst); return;
    case 13: BITONIC_SORT_13(dst); return;
    case 14: BITONIC_SORT_14(dst); return;
    case 15: BITONIC_SORT_15(dst); return;
    case 16: BITONIC_SORT_16(dst); return;
    default: return; /* not reachable: every size is handled above */
    }
}

#if SORT_SAFE_CPY

/* Copy `size` elements from src to dst one at a time using SORT_TYPE
   assignment, walking from the lowest index upwards. */
void SORT_TYPE_CPY(SORT_TYPE *dst, SORT_TYPE *src, const size_t size)
{
    size_t remaining = size;

    while (remaining > 0) {
        *dst = *src;
        ++dst;
        ++src;
        --remaining;
    }
}

/* Overlap-safe element-wise copy of `size` elements from src to dst.
   When dst lies below src the copy runs front to back (via SORT_TYPE_CPY);
   when dst lies above src it runs back to front, so in either case no source
   element is overwritten before it has been read.  dst == src is a no-op. */
void SORT_TYPE_MOVE(SORT_TYPE *dst, SORT_TYPE *src, const size_t size)
{
    size_t i;

    if (dst < src) {
        SORT_TYPE_CPY(dst, src, size);
        return;
    }

    if (dst == src) {
        return;
    }

    /* i counts one past the index being copied so the loop can stop at 0 */
    for (i = size; i > 0; --i) {
        dst[i - 1] = src[i - 1];
    }
}

#else

#undef SORT_TYPE_CPY
#define SORT_TYPE_CPY(dst, src, size) memcpy((dst), (src), (size) * sizeof(SORT_TYPE))
#undef SORT_TYPE_MOVE
#define SORT_TYPE_MOVE(dst, src, size) memmove((dst), (src), (size) * sizeof(SORT_TYPE))

#endif

/* Allocate a buffer able to hold `size` elements of SORT_TYPE.

   With SORT_SAFE_CPY the buffer comes from new[]; otherwise it comes from
   malloc and is uninitialised.  On the malloc path NULL is returned when the
   allocation fails or when size * sizeof(SORT_TYPE) does not fit in size_t
   (previously the product silently wrapped and a too-small buffer was
   returned).  Release the buffer with SORT_DELETE_BUFFER. */
SORT_TYPE* SORT_NEW_BUFFER(size_t size)
{
#if SORT_SAFE_CPY
    return new SORT_TYPE[size];
#else

    /* (size_t)-1 is the largest size_t value; avoids relying on SIZE_MAX */
    if (size > ((size_t) -1) / sizeof(SORT_TYPE)) {
        return NULL;
    }

    return (SORT_TYPE*)malloc(size * sizeof(SORT_TYPE));
#endif
}

/* Release a buffer obtained from SORT_NEW_BUFFER, using the deallocator that
   matches the allocator selected by SORT_SAFE_CPY (free for malloc,
   delete[] for new[]). */
void SORT_DELETE_BUFFER(SORT_TYPE* pointer)
{
#if !SORT_SAFE_CPY
    free(pointer);
#else
    delete[] pointer;
#endif
}


/* Shell sort implementation based on Wikipedia article
   http://en.wikipedia.org/wiki/Shell_sort

   Sorts dst[0..size) in place.  Gaps are taken from shell_gaps, starting with
   the largest entry that does not exceed size / 2 and stepping down to 1; each
   pass is a gapped insertion sort.
*/
void SHELL_SORT(SORT_TYPE *dst, const size_t size)
{
    /* index of the last (largest) entry in the gap table */
    int inci = (int)(sizeof(shell_gaps) / sizeof(shell_gaps[0])) - 1;
    size_t inc;
    size_t i;

    /* don't bother sorting an array of size 0 or 1 */
    if (size <= 1) {
        return;
    }

    /* TODO: binary search to find first gap? */
    /* Compare at the table's full 64-bit width.  Narrowing an entry to size_t
       before the comparison would truncate the large gaps where size_t is
       32 bits wide and could select a meaningless starting gap.  The loop
       stops at index 0 at the latest because shell_gaps[0] == 1 <= size / 2. */
    while (shell_gaps[inci] > (uint64_t)(size >> 1)) {
        --inci;
    }

    /* every gap from here down is <= size / 2 and therefore fits in size_t */
    inc = (size_t) shell_gaps[inci];

    while (1) {
        for (i = inc; i < size; i++) {
            SORT_TYPE temp = dst[i];
            size_t j = i;

            /* shift larger elements of this gap-chain up to make room */
            while ((j >= inc) && (SORT_CMP(dst[j - inc], temp) > 0)) {
                dst[j] = dst[j - inc];
                j -= inc;
            }

            dst[j] = temp;
        }

        if (inc == 1) {
            break;
        }

        inc = (size_t) shell_gaps[--inci];
    }
}

/* Function used to do a binary search for binary insertion sort.

   dst[0..size) must be sorted and size must be at least 1.  Returns an index
   in [0, size] at which x can be inserted while keeping the range sorted:
   0 when x sorts before dst[0], size when x sorts after dst[size - 1]
   (this case used to return size - 1, which is not a sorted position), and
   otherwise the result of a binary search that moves right on equality.
   The insertion sort in this file only calls it with x strictly less than
   dst[size - 1]. */
static __inline size_t BINARY_INSERTION_FIND(SORT_TYPE *dst, const SORT_TYPE x,
        const size_t size)
{
    size_t l, c, r;
    SORT_TYPE cx;
    l = 0;
    r = size - 1;
    c = r >> 1;

    /* check for out of bounds at the beginning and at the end. */
    if (SORT_CMP(x, dst[0]) < 0) {
        return 0;
    } else if (SORT_CMP(x, dst[r]) > 0) {
        /* x belongs after the last element */
        return size;
    }

    cx = dst[c];

    while (1) {
        const int val = SORT_CMP(x, cx);

        if (val < 0) {
            if (c - l <= 1) {
                return c;
            }

            r = c;
        } else { /* allow = for stability. The binary search favors the right. */
            if (r - c <= 1) {
                return c + 1;
            }

            l = c;
        }

        c = l + ((r - l) >> 1);
        cx = dst[c];
    }
}

/* Binary insertion sort, but knowing that the first "start" entries are sorted.  Used in timsort.

   Sorts dst[0..size) in place.  Each element from index `start` onwards is
   inserted into the sorted prefix before it, at the position reported by
   BINARY_INSERTION_FIND.  A `start` of 0 is treated as 1: a single element is
   always a sorted prefix, and beginning at 0 would read dst[-1]. */
static void BINARY_INSERTION_SORT_START(SORT_TYPE *dst, const size_t start, const size_t size)
{
    size_t i;

    for (i = (start == 0) ? 1 : start; i < size; i++) {
        size_t j;
        SORT_TYPE x;
        size_t location;

        /* If this entry is already correct, just move along */
        if (SORT_CMP(dst[i - 1], dst[i]) <= 0) {
            continue;
        }

        /* Else we need to find the right place, shift everything over, and squeeze in */
        x = dst[i];
        location = BINARY_INSERTION_FIND(dst, x, i);

        /* Move dst[location..i) up by one slot, highest index first.  Counting
           j down to location + 1 keeps the unsigned index from wrapping. */
        for (j = i; j > location; j--) {
            dst[j] = dst[j - 1];
        }

        dst[location] = x;
    }
}

/* Binary insertion sort: sorts dst[0..size) in place.  Arrays with fewer than
   two elements are left untouched. */
void BINARY_INSERTION_SORT(SORT_TYPE *dst, const size_t size)
{
    if (size > 1) {
        /* the first element on its own is the initial sorted prefix */
        BINARY_INSERTION_SORT_START(dst, 1, size);
    }
}

/* Selection sort: sorts dst[0..size) in place.  For every slot, each later
   element that compares less than the slot's current value is swapped into
   it immediately, so the slot ends up holding the minimum of the remainder. */
void SELECTION_SORT(SORT_TYPE *dst, const size_t size)
{
    size_t slot, probe;

    /* nothing to do for fewer than two elements */
    if (size < 2) {
        return;
    }

    for (slot = 0; slot < size; ++slot) {
        probe = slot;

        while (++probe < size) {
            if (SORT_CMP(dst[probe], dst[slot]) < 0) {
                SORT_SWAP(dst[slot], dst[probe]);
            }
        }
    }
}

/* In-place mergesort */
/* Exchange the `len` elements starting at dst1 with the `len` elements
   starting at dst2, pairwise from the lowest address upwards.
   A len of 0 swaps nothing (the previous do/while form decremented first and
   therefore wrapped around, swapping far beyond both arrays). */
void MERGE_SORT_IN_PLACE_ASWAP(SORT_TYPE * dst1, SORT_TYPE * dst2, size_t len)
{
    while (len > 0) {
        SORT_SWAP(*dst1, *dst2);
        dst1++;
        dst2++;
        len--;
    }
}

/* Merge two runs front to back, moving elements only by swapping.

   dst1 points at the first of l1 elements of run 1, dst2 at the first of l2
   elements of run 2.  Output is written starting l1 elements before dst2
   (dst0 = dst2 - l1) and grows upwards towards run 2.  Because every move is a
   swap, whatever was stored in the output slots ends up in the vacated source
   slots.  Both l1 and l2 must be at least 1 (the loops are do/while).
   NOTE(review): presumably both runs are already sorted and run 1 sits in a
   scratch area away from the output region - confirm against the caller. */
void MERGE_SORT_IN_PLACE_FRONTMERGE(SORT_TYPE *dst1, size_t l1, SORT_TYPE *dst2, size_t l2)
{
    SORT_TYPE *dst0 = dst2 - l1;

    /* Fast path: the last element of run 1 does not sort after the first of
       run 2, so run 1 goes in front as one block. */
    if (SORT_CMP(dst1[l1 - 1], dst2[0]) <= 0) {
        MERGE_SORT_IN_PLACE_ASWAP(dst1, dst0, l1);
        return;
    }

    do {
        /* take from run 1 while its head is strictly smaller than run 2's */
        while (SORT_CMP(*dst2, *dst1) > 0) {
            SORT_SWAP(*dst1, *dst0);
            dst1++;
            dst0++;

            /* Run 1 used up: dst0 has caught up with dst2, so the rest of
               run 2 is already in its final place. */
            if (--l1 == 0) {
                return;
            }
        }

        /* otherwise (ties included) take the head of run 2 */
        SORT_SWAP(*dst2, *dst0);
        dst2++;
        dst0++;
    } while (--l2);

    /* run 2 used up: move the remainder of run 1 to the end of the output */
    do {
        SORT_SWAP(*dst1, *dst0);
        dst1++;
        dst0++;
    } while (--l1);
}

/* Merge two runs back to front, moving elements only by swapping.

   dst1 points at the LAST of l1 elements of run 1 and dst2 at the LAST of l2
   elements of run 2; both runs are read downwards.  Output is written
   downwards starting at dst2 + l1.  Because every move is a swap, whatever
   was stored in the output slots ends up in the vacated source slots.
   Both l1 and l2 must be at least 1 (the loops are do/while).

   Returns l1 when the whole of run 1 was placed behind run 2 as one block,
   0 when run 1 was used up during the merge, and otherwise the number of
   run-1 elements that were still pending when run 2 ran out.
   NOTE(review): presumably both runs are already sorted - confirm against
   the caller. */
size_t MERGE_SORT_IN_PLACE_BACKMERGE(SORT_TYPE * dst1, size_t l1, SORT_TYPE * dst2, size_t l2)
{
    size_t res;
    SORT_TYPE *dst0 = dst2 + l1;
    /* First element of run 1.  Computed by pointer subtraction; indexing with
       the unsigned expression 1 - l1 relied on wrap-around arithmetic. */
    SORT_TYPE *first1 = dst1 - (l1 - 1);

    /* Fast path: the first element of run 1 does not sort before the last of
       run 2, so run 1 goes behind run 2 as one block. */
    if (SORT_CMP(*first1, dst2[0]) >= 0) {
        MERGE_SORT_IN_PLACE_ASWAP(first1, dst0 - l1 + 1, l1);
        return l1;
    }

    do {
        /* take from run 1 while its tail is strictly larger than run 2's */
        while (SORT_CMP(*dst2, *dst1) < 0) {
            SORT_SWAP(*dst1, *dst0);
            dst1--;
            dst0--;

            if (--l1 == 0) {
                return 0;
            }
        }

        /* otherwise (ties included) take the tail of run 2 */
        SORT_SWAP(*dst2, *dst0);
        dst2--;
        dst0--;
    } while (--l2);

    /* run 2 used up: remember how much of run 1 is left, then move it down */
    res = l1;

    do {
        SORT_SWAP(*dst1, *dst0);
        dst1--;
        dst0--;
    } while (--l1);

    return res;
}

/* Block-wise merge step of the in-place merge sort.

   Works on dst[0..len) in blocks of r elements and uses the r elements at
   dst[len..len+r) as a swap buffer.  lp is the index of the block boundary
   that is tested first: if dst[lp] does not sort before dst[lp - 1] the
   function returns without touching anything.
   NOTE(review): presumably dst[0..lp) and dst[lp..len) are each sorted on
   entry and len and lp are multiples of r - confirm against the caller. */
void MERGE_SORT_IN_PLACE_RMERGE(SORT_TYPE *dst, size_t len, size_t lp, size_t r)
{
    size_t i, lq;
    int cv;

    if (SORT_CMP(dst[lp], dst[lp - 1]) >= 0) {
        return;
    }

    lq = lp;

    for (i = 0; i < len; i += r) {
        /* Among block i and the blocks starting at lp, lp + r, ..., lq, pick
           the one with the smallest first element; ties are decided by the
           blocks' last elements. */
        size_t q = i, j;

        for (j = lp; j <= lq; j += r) {
            cv = SORT_CMP(dst[j], dst[q]);

            if (cv == 0) {
                cv = SORT_CMP(dst[j + r - 1], dst[q + r - 1]);
            }

            if (cv < 0) {
                q = j;
            }
        }

        if (q != i) {
            MERGE_SORT_IN_PLACE_ASWAP(dst + i, dst + q, r); /* swap the chosen block into position i */

            /* the last candidate was consumed: extend the candidate range by
               one block unless it is already the final block */
            if (q == lq && q < (len - r)) {
                lq += r;
            }
        }

        /* block i starts below the end of block i - r: merge the two */
        if (i != 0 && SORT_CMP(dst[i], dst[i - 1]) < 0) {
            MERGE_SORT_IN_PLACE_ASWAP(dst + len, dst + i, r); /* move block i into the buffer */
            MERGE_SORT_IN_PLACE_BACKMERGE(dst + (len + r - 1), r, dst + (i - 1),
                                          r);  /* buffer :merge: dst[i-r..i) -> dst[i-r..i+r) */
        }

        /* position i was the first candidate block; the candidates now start
           one block later */
        if (lp == i) {
            lp += r;
        }
    }
}

/* In-place Merge Sort implementation. (c)2012, Andrey Astrelin, astrelin@tochka.ru */
/*
 * Sort dst[0..len). No scratch array is allocated in this function; the
 * merge helpers it calls move elements with swaps, using other parts of the
 * array as buffer space.
 */
void MERGE_SORT_IN_PLACE(SORT_TYPE *dst, const size_t len)
{
    /* r: block size obtained from rbnd(len).
       lr: leading part of the array (a multiple of r) processed by the loops below;
       the remaining len - lr elements are dealt with at the end of the function. */
    size_t r = rbnd(len);
    size_t lr = (len / r - 1) * r;
    /* dst1 trails the loop counter: after "dst1 += 2" it addresses dst[p - 1] */
    SORT_TYPE *dst1 = dst - 1;
    size_t p, m, q, q1, p0;

    /* don't bother sorting an array of size <= 1 */
    if (len <= 1) {
        return;
    }

    /* short inputs are handed to SMALL_SORT */
    if (len <= SMALL_SORT_BND) {
        SMALL_SORT(dst, len);
        return;
    }

    /* bottom-up pass over dst[0..lr), two elements per step */
    for (p = 2; p <= lr; p += 2) {
        dst1 += 2;

        /* order the pair dst[p-2], dst[p-1] */
        if (SORT_CMP(dst1[0], dst1[-1]) < 0) {
            SORT_SWAP(dst1[0], dst1[-1]);
        }

        /* bit 1 of p set: no larger merge is attempted at this position */
        if (p & 2) {
            continue;
        }

        /* m: number of elements after position p */
        m = len - p;
        q = 2;

        /* grow q while the two neighbouring q-sized pieces ending at p are already in order */
        while ((p & q) == 0) {
            if (SORT_CMP(dst1[1 - q], dst1[-(int) q]) < 0) {
                break;
            }

            q *= 2;
        }

        /* every merge that ends at p was already in order */
        if (p & q) {
            continue;
        }

        if (q < m) {
            /* swap the last q sorted elements with the final q elements of the array ... */
            p0 = len - q;
            MERGE_SORT_IN_PLACE_ASWAP(dst + p - q, dst + p0, q);

            /* ... keep front-merging there while the size can double, fits in m, and bit q1 of p is clear ... */
            for (;;) {
                q1 = 2 * q;

                if ((q1 > m) || (p & q1)) {
                    break;
                }

                p0 = len - q1;
                MERGE_SORT_IN_PLACE_FRONTMERGE(dst + (p - q1), q, dst + p0 + q, q);
                q = q1;
            }

            /* ... then merge backwards from the end of the array into the slots ending at p */
            MERGE_SORT_IN_PLACE_BACKMERGE(dst + (len - 1), q, dst1 - q, q);
            q *= 2;
        }

        /* q1: q halved until it no longer exceeds m; passed as the block size to RMERGE */
        q1 = q;

        while (q1 > m) {
            q1 /= 2;
        }

        /* remaining merges ending at p are done block-wise */
        while ((q & p) == 0) {
            q *= 2;
            MERGE_SORT_IN_PLACE_RMERGE(dst + (p - q), q, q / 2, q1);
        }
    }

    q1 = 0;

    /* walk the set bits of lr from r upwards, accumulating their sum in q1 and
       merging the accumulated tail of dst[0..lr) each time a further bit is found */
    for (q = r; q < lr; q *= 2) {
        if ((lr & q) != 0) {
            q1 += q;

            if (q1 != q) {
                MERGE_SORT_IN_PLACE_RMERGE(dst + (lr - q1), q1, q, r);
            }
        }
    }

    /* sort the trailing len - lr elements, swap them to the front, merge backwards,
       and finally sort the first m elements again (m was increased by BACKMERGE's return value) */
    m = len - lr;
    MERGE_SORT_IN_PLACE(dst + lr, m);
    MERGE_SORT_IN_PLACE_ASWAP(dst, dst + lr, m);
    m += MERGE_SORT_IN_PLACE_BACKMERGE(dst + (m - 1), m, dst + (lr - 1), lr - m);
    MERGE_SORT_IN_PLACE(dst, m);
}

/*
 * Recursive worker for the standard merge sort: sorts dst[0..size), using
 * newdst[0..size) as the merge target before copying the result back.
 */
void MERGE_SORT_RECURSIVE(SORT_TYPE *newdst, SORT_TYPE *dst, const size_t size)
{
    const size_t middle = size / 2;
    size_t lo = 0;        /* read cursor in the lower half  dst[0..middle)    */
    size_t hi = middle;   /* read cursor in the upper half  dst[middle..size) */
    size_t out;

    /* nothing to do for zero or one element */
    if (size <= 1) {
        return;
    }

    /* short ranges go straight to binary insertion sort */
    if (size <= SMALL_SORT_BND) {
        BINARY_INSERTION_SORT(dst, size);
        return;
    }

    MERGE_SORT_RECURSIVE(newdst, dst, middle);
    MERGE_SORT_RECURSIVE(newdst, &dst[middle], size - middle);

    /* merge the two sorted halves into newdst; ties are taken from the lower half */
    for (out = 0; out != size; out++) {
        if (lo >= middle) {
            newdst[out] = dst[hi++];
        } else if (hi >= size) {
            newdst[out] = dst[lo++];
        } else if (SORT_CMP(dst[lo], dst[hi]) <= 0) {
            newdst[out] = dst[lo++];
        } else {
            newdst[out] = dst[hi++];
        }
    }

    SORT_TYPE_CPY(dst, newdst, size);
}

/* Standard merge sort: public entry point, owns the temporary buffer. */
void MERGE_SORT(SORT_TYPE *dst, const size_t size)
{
    /* zero or one element is already sorted */
    if (size <= 1) {
        return;
    }

    if (size > SMALL_SORT_BND) {
        /* one buffer of the full size is shared by all recursion levels */
        SORT_TYPE *scratch = SORT_NEW_BUFFER(size);
        MERGE_SORT_RECURSIVE(scratch, dst, size);
        SORT_DELETE_BUFFER(scratch);
    } else {
        BINARY_INSERTION_SORT(dst, size);
    }
}


/*
 * Partition dst[left..right] around the value found at index pivot.
 * Returns the final index of the pivot, or SIZE_MAX when every element in
 * dst[left..right) compared equal to it.
 */
static __inline size_t QUICK_SORT_PARTITION(SORT_TYPE *dst, const size_t left,
        const size_t right, const size_t pivot)
{
    SORT_TYPE pivot_value = dst[pivot];
    size_t boundary = left;   /* dst[left..boundary) holds elements below the pivot */
    size_t scan = left;
    int any_difference = 0;   /* OR of all comparison results */

    /* park the pivot at the right edge while scanning */
    SORT_SWAP(dst[pivot], dst[right]);

    while (scan < right) {
        const int order = SORT_CMP(dst[scan], pivot_value);
        any_difference |= order;

        if (order < 0) {
            SORT_SWAP(dst[scan], dst[boundary]);
            boundary++;
        }

        scan++;
    }

    /* drop the pivot between the two parts */
    SORT_SWAP(dst[right], dst[boundary]);

    /* all comparisons were 0: signal the degenerate case to the caller */
    return (any_difference != 0) ? boundary : SIZE_MAX;
}

/* Based on Knuth vol. 3
static __inline size_t QUICK_SORT_HOARE_PARTITION(SORT_TYPE *dst, const size_t l,
    const size_t r, const size_t pivot) {
  SORT_TYPE value;
  size_t i = l + 1;
  size_t j = r;

  if (pivot != l) {
    SORT_SWAP(dst[pivot], dst[l]);
  }
  value = dst[l];

  while (1) {
    while (SORT_CMP(dst[i], value) < 0) {
      i++;
    }
    while (SORT_CMP(value, dst[j]) < 0) {
      j--;
    }
    if (j <= i) {
      SORT_SWAP(dst[l], dst[j]);
      return j;
    }
    SORT_SWAP(dst[i], dst[j]);
    i++;
    j--;
  }
  return 0;
}
*/


/*
 * Return the median index of the objects at the three indices:
 * whichever of a, b, c refers to the middle value of dst[a], dst[b], dst[c].
 * At most three comparisons are made; each one is a strict "less than" test.
 */
static __inline size_t MEDIAN(const SORT_TYPE *dst, const size_t a, const size_t b,
                              const size_t c)
{
    const int AB = SORT_CMP(dst[a], dst[b]) < 0;

    if (AB) {
        /* a < b */
        const int BC = SORT_CMP(dst[b], dst[c]) < 0;

        if (BC) {
            /* a < b < c */
            return b;
        } else {
            /* a < b, c <= b */
            const int AC = SORT_CMP(dst[a], dst[c]) < 0;

            if (AC) {
                /* a < c <= b */
                return c;
            } else {
                /* c <= a < b */
                return a;
            }
        }
    } else {
        /* b <= a: a is the median exactly when it is below c, so compare a with c
           (comparing a with b again would always be false here) */
        const int AC = SORT_CMP(dst[a], dst[c]) < 0;

        if (AC) {
            /* b <= a < c */
            return a;
        } else {
            /* b <= a, c <= a */
            const int BC = SORT_CMP(dst[b], dst[c]) < 0;

            if (BC) {
                /* b < c <= a */
                return c;
            } else {
                /* c <= b <= a */
                return b;
            }
        }
    }
}

/*
 * Quick sort of the inclusive index range dst[original_left..original_right].
 * One side of each partition is handled by a recursive call, the other by
 * looping in place. Short ranges are given to SMALL_SORT, and after roughly
 * lg(N) loop iterations the remaining range is given to HEAP_SORT.
 */
static void QUICK_SORT_RECURSIVE(SORT_TYPE *dst, const size_t original_left,
                                 const size_t original_right)
{
    size_t left;
    size_t right;
    size_t pivot;
    size_t new_pivot;
    size_t middle;
    /* loop_count counts iterations of the while loop in THIS invocation only */
    int loop_count = 0;
    const int max_loops = 64 - CLZ(original_right - original_left); /* ~lg N */
    left = original_left;
    right = original_right;

    while (1) {
        /* empty or single-element range */
        if (right <= left) {
            return;
        }

        if ((right - left + 1U) <= SMALL_SORT_BND) {
            SMALL_SORT(&dst[left], right - left + 1U);
            return;
        }

        if (++loop_count >= max_loops) {
            /* we have recursed / looped too many times; switch to heap sort */
            HEAP_SORT(&dst[left], right - left + 1U);
            return;
        }

        /* median of 5 */
        /* first the median of left / middle / right, then the median of that
           result and the two quarter points */
        middle = left + ((right - left) >> 1);
        pivot = MEDIAN((const SORT_TYPE *) dst, left, middle, right);
        pivot = MEDIAN((const SORT_TYPE *) dst, left + ((middle - left) >> 1), pivot,
                       middle + ((right - middle) >> 1));
        new_pivot = QUICK_SORT_PARTITION(dst, left, right, pivot);

        /* check for partition all equal */
        if (new_pivot == SIZE_MAX) {
            return;
        }

        /* recurse only on the small part to avoid degenerate stack sizes */
        /* and manually do tail call on the large part */
        /* NOTE(review): these subtractions are unsigned and wrap when new_pivot
           equals left or right; confirm the wrapped bounds are always caught by
           the range checks at the top of the loop / of the recursive call. */
        if (new_pivot - 1U - left > right - new_pivot - 1U) {
            /* left is bigger than right */
            QUICK_SORT_RECURSIVE(dst, new_pivot + 1U, right);
            /* tail call for left */
            right = new_pivot - 1U;
        } else {
            /* right is bigger than left */
            QUICK_SORT_RECURSIVE(dst, left, new_pivot - 1U);
            /* tail call for right */
            left = new_pivot + 1U;
        }
    }
}

/* Quick sort entry point: sorts dst[0..size). */
void QUICK_SORT(SORT_TYPE *dst, const size_t size)
{
    /* only ranges with at least two elements need any work */
    if (size > 1) {
        QUICK_SORT_RECURSIVE(dst, 0U, size - 1U);
    }
}


/* timsort implementation, based on timsort.txt */

/* Reverse dst[start..end] (both indices inclusive) in place. */
static __inline void REVERSE_ELEMENTS(SORT_TYPE *dst, size_t start, size_t end)
{
    /* swap the outermost pair and move both cursors inwards until they meet */
    for (; start < end; start++, end--) {
        SORT_SWAP(dst[start], dst[end]);
    }
}

/*
 * Find the run that begins at dst[start] and return its length.
 * A strictly descending run is reversed in place before returning, so the
 * reported elements are always in ascending order. The scan never advances
 * past index size - 1.
 */
static size_t COUNT_RUN(SORT_TYPE *dst, const size_t start, const size_t size)
{
    const size_t last = size - 1;
    size_t curr;

    /* a single trailing element is a run of one */
    if (size - start == 1) {
        return 1;
    }

    /* two trailing elements: put them in order and report both */
    if (start >= size - 2) {
        if (SORT_CMP(dst[size - 2], dst[size - 1]) > 0) {
            SORT_SWAP(dst[size - 2], dst[size - 1]);
        }

        return 2;
    }

    curr = start + 2;

    if (SORT_CMP(dst[start], dst[start + 1]) > 0) {
        /* decreasing run: extend while each element is strictly below its predecessor */
        while (curr != last && SORT_CMP(dst[curr - 1], dst[curr]) > 0) {
            curr++;
        }

        /* reverse in-place */
        REVERSE_ELEMENTS(dst, start, curr - 1);
    } else {
        /* increasing run: extend while each element is not below its predecessor */
        while (curr != last && SORT_CMP(dst[curr - 1], dst[curr]) <= 0) {
            curr++;
        }
    }

    return curr - start;
}

/*
 * Test the run-length invariant on the top of the run stack.
 * Returns 1 when it holds and 0 when the stack needs collapsing.
 * With fewer than two runs it holds trivially.
 */
static int CHECK_INVARIANT(TIM_SORT_RUN_T *stack, const int stack_curr)
{
    size_t top, second, third;

    if (stack_curr < 2) {
        return 1;
    }

    top = stack[stack_curr - 1].length;
    second = stack[stack_curr - 2].length;

    /* exactly two runs: the lower one must be strictly longer */
    if (stack_curr == 2) {
        return second > top;
    }

    third = stack[stack_curr - 3].length;

    /* three or more: third > second + top  and  second > top */
    return (third > second + top) && (second > top);
}

/* Handle for the temporary buffer used by the tim sort merge routines. */
typedef struct {
    size_t alloc;       /* element count recorded by the last successful (re)allocation */
    SORT_TYPE *storage; /* the buffer itself; NULL while nothing is allocated */
} TEMP_STORAGE_T;

/*
 * Make sure store->storage can hold at least new_size elements.
 * The buffer is only ever (re)allocated when it is missing or too small.
 * On allocation failure a message is printed to stderr and the process exits
 * with status 1.
 */
static void TIM_SORT_RESIZE(TEMP_STORAGE_T *store, const size_t new_size)
{
    SORT_TYPE *grown;

    /* an existing buffer that is already large enough is kept as is */
    if ((store->storage != NULL) && (store->alloc >= new_size)) {
        return;
    }

    grown = (SORT_TYPE *)realloc(store->storage, new_size * sizeof(SORT_TYPE));

    if (grown == NULL) {
        fprintf(stderr, "Error allocating temporary storage for tim sort: need %lu bytes",
                (unsigned long)(sizeof(SORT_TYPE) * new_size));
        exit(1);
    }

    store->storage = grown;
    store->alloc = new_size;
}


/*
 * Galloping search for key in the sorted range dst[0..size).
 *
 * The search starts at index anchor and probes at growing distances
 * (1, 3, 7, ...) to the left or right of it until key is bracketed, then
 * finishes with a binary search inside the bracket.
 *
 * right != 0: returns the first index whose element compares above key
 *             (key would be inserted after any equal elements).
 * right == 0: returns the first index whose element does not compare below
 *             key (key would be inserted before any equal elements).
 * The result is in [0, size].
 *
 * Offsets are kept in int, so anchor and size - anchor - 1 are narrowed to int.
 */
static size_t TIM_SORT_GALLOP(SORT_TYPE *dst, const size_t size, const SORT_TYPE key, size_t anchor,
                              int right)
{
    int last_ofs = 0;
    int ofs, max_ofs, ofs_sign, cmp;
    size_t  l, c, r;
    cmp = SORT_CMP(key, dst[anchor]);

    if (cmp < 0 || (!right && cmp == 0)) {
        /* key belongs to the left of anchor */
        /* short cut */
        if (anchor == 0) {
            return 0;
        }

        ofs = -1;
        ofs_sign = -1;
        max_ofs = -(int)anchor; /* ensure anchor+max_ofs is valid idx */
    } else {
        /* key belongs to the right of anchor */
        if (anchor == size - 1) {
            return size;
        }

        ofs = 1;
        ofs_sign = 1;
        max_ofs = (int)(size - anchor - 1);
    }

    for (;;) {
        /* deal with overflow */
        /* the next probe would reach or pass the end of the range: clamp it
           and test the outermost element directly */
        if (max_ofs / ofs <= 1) {
            ofs = max_ofs;

            if (ofs < 0) {
                cmp = SORT_CMP(key, dst[0]);

                if ((right && cmp < 0) || (!right && cmp <= 0)) {
                    return 0;
                }
            } else {
                cmp = SORT_CMP(dst[size - 1], key);

                if ((right && cmp <= 0) || (!right && cmp < 0)) {
                    return size;
                }
            }

            break;
        }

        c = anchor + ofs;
        /* right, 0<ofs:  dst[anchor+last_ofs] <=  key  <  dst[anchor+ofs]      */
        /* left,  0<ofs:  dst[anchor+last_ofs] <   key  <= dst[anchor+ofs]      */
        /* right, ofs<0:  dst[anchor+ofs]      <=  key  <  dst[anchor+last_ofs] */
        /* left,  ofs<0:  dst[anchor+ofs]      <   key  <= dst[anchor+last_ofs] */
        cmp = SORT_CMP(key, dst[c]);

        if (0 < ofs) {
            if ((right && cmp < 0) || (!right && cmp <= 0)) {
                break;
            }
        } else {
            if ((right && 0 <= cmp) || (!right && 0 < cmp)) {
                break;
            }
        }

        last_ofs = ofs;
        /* double the distance and add one step; multiplication is used rather
           than "<<" because ofs is negative when galloping leftwards and
           left-shifting a negative int is undefined behaviour */
        ofs = ofs * 2 + ofs_sign;
    }

    /* key in region (l, r) , both l and r have already been compared */
    if (ofs < 0) {
        l = anchor + ofs;
        r = anchor + last_ofs;
    } else {
        l = anchor + last_ofs;
        r = anchor + ofs;
    }

    /* binary search inside the bracket; r always stays a valid answer */
    while (1 < r - l) {
        c = l + ((r - l) >> 1);
        cmp = SORT_CMP(key, dst[c]);

        if ((right && cmp < 0) || (!right && cmp <= 0)) {
            r = c;
        } else {
            l = c;
        }
    }

    return r;
}


/*
 * Front-to-back merge of run A (A elements at A_src) with run B (B elements
 * at B_src); the output is written starting at the original A_src.
 * Run A is first copied into storage, which must therefore be able to hold
 * A elements. *min_gallop_p is read on entry and the adjusted value is
 * written back on exit.
 * The code relies on the caller having arranged that the first output
 * element comes from B and the last one from A.
 */
static void TIM_SORT_MERGE_LEFT(SORT_TYPE *A_src, SORT_TYPE *B_src, const size_t A, const size_t B,
                                SORT_TYPE* storage, int *min_gallop_p)
{
    /* pdst / pa / pb: cursors into the output, the copy of A, and B */
    size_t pdst, pa, pb, k;
    /* lengths of the current streak of consecutive picks from A and from B */
    int a_count, b_count;
    int min_gallop = *min_gallop_p;
    SORT_TYPE *dst = A_src;
    /* move A out of the way; from here on A_src refers to the copy */
    SORT_TYPE_CPY(storage, dst, A);
    A_src = storage;
    pdst = pa = pb = 0;
    /* the first element must be in B, otherwise it is skipped in the caller */
    dst[pdst++] = B_src[pb++];

    if (B == 1) {
        goto copyA;
    }

    for (;;) {
        a_count = b_count = 0;

        /* one-at-a-time mode: runs until one side has supplied min_gallop elements in a row */
        for (;;) {
            if (SORT_CMP(A_src[pa], B_src[pb]) <= 0) {
                dst[pdst++] = A_src[pa++];
                ++a_count;
                b_count = 0;

                /* No need to check if pa == A because the last element must be in A
                 * so pb will reach to B first. You can check pa == A-1 and do
                 * some optimization if you wish.*/
                if (min_gallop <= a_count) {
                    break;
                }
            } else {
                dst[pdst++] = B_src[pb++];
                ++b_count;
                a_count = 0;

                if (pb == B) {
                    goto copyA;
                }

                if (min_gallop <= b_count) {
                    break;
                }
            }
        }

        /* offsets the decrement performed by the first pass of the gallop loop */
        ++min_gallop;

        /* galloping mode: copy whole stretches located with TIM_SORT_GALLOP */
        for (;;) {
            /* each pass lowers the threshold, but never below 0 */
            if (min_gallop != 0) {
                min_gallop --;
            }

            /* k elements of A come before B's current head */
            k = TIM_SORT_GALLOP(&A_src[pa], A - pa, B_src[pb], 0, 1);
            SORT_TYPE_CPY(&dst[pdst], &A_src[pa], k);
            pdst += k;
            pa += k;
            /* now we know the next must be in B */
            dst[pdst++] = B_src[pb++];

            if (pb == B) {
                goto copyA;
            }

            /* the stretch was short: raise the threshold and return to one-at-a-time mode */
            if (a_count && k < TIM_SORT_MIN_GALLOP) {
                ++min_gallop;
                break;
            }

            /* k elements of B come before A's current head; source and
               destination are both inside the original array, hence MOVE */
            k = TIM_SORT_GALLOP(&B_src[pb], B - pb, A_src[pa], 0, 0);
            SORT_TYPE_MOVE(&dst[pdst], &B_src[pb], k);
            pdst += k;
            pb += k;

            if (pb == B) {
                goto copyA;
            }

            /* the next element comes from A */
            dst[pdst++] = A_src[pa++];

            if (b_count && k < TIM_SORT_MIN_GALLOP) {
                ++min_gallop;
                break;
            }
        }
    }

copyA:
    /* B is exhausted: whatever is left of the copy of A completes the output */
    SORT_TYPE_CPY(&dst[pdst], &A_src[pa], A - pa);
    *min_gallop_p = min_gallop;
    return;
}


/*
 * Back-to-front merge of run A (A elements at A_src) with run B (B elements
 * at B_src); the output occupies dst[0..A+B) where dst is the original A_src.
 * Run B is first copied into storage, which must therefore be able to hold
 * B elements. *min_gallop_p is read on entry and the adjusted value is
 * written back on exit.
 * The code relies on the caller having arranged that the last output element
 * comes from A and the first one from B.
 * Cursors are signed ints so that -1 can mark "run exhausted".
 */
static void TIM_SORT_MERGE_RIGHT(SORT_TYPE *A_src, SORT_TYPE *B_src, const size_t A, const size_t B,
                                 SORT_TYPE* storage, int *min_gallop_p)
{
    size_t k;
    /* pdst / pa / pb: cursors into the output, A, and the copy of B, all moving downwards */
    int pdst, pa, pb, a_count, b_count;
    int min_gallop = *min_gallop_p;
    SORT_TYPE *dst = A_src;
    pa = (int)(A - 1);
    pb = (int)(B - 1);
    pdst = (int)(A + B - 1);
    /* move B out of the way; from here on B_src refers to the copy */
    SORT_TYPE_CPY(storage, B_src, B);
    B_src = storage;
    /* the last element must be in A, otherwise it is skipped in the caller */
    dst[pdst--] = A_src[pa--];

    if (A == 1) {
        goto copyB;
    }

    for (;;) {
        a_count = b_count = 0;

        /* one-at-a-time mode: runs until one side has supplied min_gallop elements in a row */
        for (;;) {
            if (SORT_CMP(A_src[pa], B_src[pb]) <= 0) {
                dst[pdst--] = B_src[pb--];
                ++b_count;
                a_count = 0;

                if (min_gallop <= b_count) {
                    break;
                }

                /* No need to check if pb == -1 because the first element must be in B
                 * so pa will reach to -1 first. You can check pb == 0 and do
                 * some optimization if you wish.*/
            } else {
                dst[pdst--] = A_src[pa--];
                ++a_count;
                b_count = 0;

                if (pa == -1) {
                    goto copyB;
                }

                if (min_gallop <= a_count) {
                    break;
                }
            }
        }

        /* offsets the decrement performed by the first pass of the gallop loop */
        ++min_gallop;

        /* galloping mode: move whole stretches located with TIM_SORT_GALLOP */
        for (;;) {
            /* each pass lowers the threshold, but never below 0 */
            if (min_gallop != 0) {
                min_gallop --;
            }

            /* elements A_src[k..pa] come after B's current tail and are moved up in one go */
            k = TIM_SORT_GALLOP(A_src, pa + 1, B_src[pb], pa, 1);
            /* Understand the margin by considering k==0 */
            SORT_TYPE_MOVE(&dst[pb + k + 1], &A_src[k], pa + 1 - k);
            pdst = pb + (int)k;
            pa = (int)(k - 1);

            if (pa == -1) {
                goto copyB;
            }

            /* now we know the next must be in B */
            dst[pdst--] = B_src[pb--];

            /* NOTE(review): pa was just set to k - 1, so "pa + 1 - k" is the
               post-update value, not the length of the stretch moved above -- confirm intent */
            if (a_count && pa + 1 - k < TIM_SORT_MIN_GALLOP) {
                ++min_gallop;
                break;
            }

            /* elements B_src[k..pb] come after A's current tail and are copied up in one go */
            k = TIM_SORT_GALLOP(B_src, pb + 1, A_src[pa], pb, 0);
            SORT_TYPE_CPY(&dst[pa + k + 1], &B_src[k], pb + 1 - k);
            pdst = pa + (int)k;
            pb = (int)(k - 1);
            /* the next element comes from A */
            dst[pdst--] = A_src[pa--];

            if (pa == -1) {
                goto copyB;
            }

            /* NOTE(review): same remark as above for "pb + 1 - k" */
            if (b_count && pb + 1 - k < TIM_SORT_MIN_GALLOP) {
                ++min_gallop;
                break;
            }
        }
    }

copyB:
    /* A is exhausted: whatever is left of the copy of B fills the front of the output */
    SORT_TYPE_CPY(dst, B_src, pb + 1);
    *min_gallop_p = min_gallop;
    return;
}


/*
 * Merge the runs described by stack[stack_curr - 2] (run A) and
 * stack[stack_curr - 1] (run B) inside dst. The stack entries are only read
 * here; callers adjust the recorded lengths afterwards.
 *
 * Both runs are trimmed first: the leading part of A that already precedes
 * B[0] and the trailing part of B that already follows the last element of
 * A are left where they are. The temporary buffer is then sized to the
 * shorter of the two trimmed runs, and the merge direction is chosen so that
 * the shorter run is the one copied out.
 */
static void TIM_SORT_MERGE(SORT_TYPE *dst, const TIM_SORT_RUN_T *stack, const int stack_curr,
                           TEMP_STORAGE_T *store, int* min_gallop_p)
{
    size_t A = stack[stack_curr - 2].length;
    size_t B = stack[stack_curr - 1].length;
    size_t A_start = stack[stack_curr - 2].start;
    size_t B_start = stack[stack_curr - 1].start;
    SORT_TYPE *storage;
    size_t k;
    /* A[k-1] <= B[0] < A[k] */
    k = TIM_SORT_GALLOP(&dst[A_start], A, dst[B_start], 0, 1);
    A_start += k;
    A -= k;

    /* all of A already precedes B: the two runs are in order as they stand */
    if (A == 0) {
        /* NOTE(review): the gallop threshold is halved on this path; this only
           affects tuning, not the result -- confirm it is intended */
        *min_gallop_p /= 2;
        return;
    }

    /* B[k-1] < A[A-1] <= B[k] */
    /* dst[B_start - 1] is the last element of A; the search is anchored at the end of B */
    k = TIM_SORT_GALLOP(&dst[B_start], B, dst[B_start - 1], B - 1, 0);
    B = k;
    TIM_SORT_RESIZE(store, MIN(A, B));
    storage = store->storage;

    if (A < B) {
        /* A is shorter: it is the run copied to storage */
        TIM_SORT_MERGE_LEFT(&dst[A_start], &dst[B_start], A, B, storage, min_gallop_p);
    } else {
        /* B is shorter or equal: it is the run copied to storage */
        TIM_SORT_MERGE_RIGHT(&dst[A_start], &dst[B_start], A, B, storage, min_gallop_p);
    }
}

/*
 * Merge runs on top of the run stack until the length invariants checked
 * below hold again (or a single run is left), and return the new stack
 * height. size is the total number of elements being sorted; it is used to
 * recognise the final merge of a two-entry stack.
 */
static int TIM_SORT_COLLAPSE(SORT_TYPE *dst, TIM_SORT_RUN_T *stack, int stack_curr,
                             TEMP_STORAGE_T *store, const size_t size, int* min_gallop_p)
{
    while (1) {
        /* lengths of the top four runs; D is the topmost */
        size_t A, B, C, D;
        int ABC, BCD, CD;

        /* if the stack only has one thing on it, we are done with the collapse */
        if (stack_curr <= 1) {
            break;
        }

        /* if this is the last merge, just do it */
        if ((stack_curr == 2) && (stack[0].length + stack[1].length == size)) {
            TIM_SORT_MERGE(dst, stack, stack_curr, store, min_gallop_p);
            stack[0].length += stack[1].length;
            stack_curr--;
            break;
        }
        /* check if the invariant is off for a stack of 2 elements */
        else if ((stack_curr == 2) && (stack[0].length <= stack[1].length)) {
            TIM_SORT_MERGE(dst, stack, stack_curr, store, min_gallop_p);
            stack[0].length += stack[1].length;
            stack_curr--;
            break;
        } else if (stack_curr == 2) {
            /* two runs and the lower one is longer: nothing to do */
            break;
        }

        B = stack[stack_curr - 3].length;
        C = stack[stack_curr - 2].length;
        D = stack[stack_curr - 1].length;

        /* the fourth run from the top takes part only when it exists */
        if (stack_curr >= 4) {
            A = stack[stack_curr - 4].length;
            ABC = (A <= B + C);
        } else {
            ABC = 0;
        }

        /* BCD / CD are set when the corresponding invariant is VIOLATED */
        BCD = (B <= C + D) || ABC;
        CD = (C <= D);

        /* Both invariants are good */
        if (!BCD && !CD) {
            break;
        }

        /* left merge */
        /* merge B with C, then slide D down into the freed slot */
        if (BCD && !CD) {
            TIM_SORT_MERGE(dst, stack, stack_curr - 1, store, min_gallop_p);
            stack[stack_curr - 3].length += stack[stack_curr - 2].length;
            stack[stack_curr - 2] = stack[stack_curr - 1];
            stack_curr--;
        } else {
            /* right merge */
            /* merge C with D */
            TIM_SORT_MERGE(dst, stack, stack_curr, store, min_gallop_p);
            stack[stack_curr - 2].length += stack[stack_curr - 1].length;
            stack_curr--;
        }
    }

    return stack_curr;
}

/*
 * Detect the next run starting at *curr, extend it to the minimum run length
 * if it is shorter, and push it on the run stack.
 *
 * Returns 1 while input remains. Once the run reaches the end of the array,
 * all runs on the stack are merged, the temporary storage is released, and 0
 * is returned.
 */
static __inline int PUSH_NEXT(SORT_TYPE *dst,
                              const size_t size,
                              TEMP_STORAGE_T *store,
                              const size_t minrun,
                              TIM_SORT_RUN_T *run_stack,
                              size_t *stack_curr,
                              size_t *curr,
                              int *min_gallop_p)
{
    const size_t remaining = size - *curr;
    /* target length: minrun, capped by what is left of the array */
    const size_t wanted = (minrun > remaining) ? remaining : minrun;
    size_t len = COUNT_RUN(dst, *curr, size);

    /* natural run too short: grow it with binary insertion sort */
    if (len < wanted) {
        BINARY_INSERTION_SORT_START(&dst[*curr], len, wanted);
        len = wanted;
    }

    run_stack[*stack_curr].start = *curr;
    run_stack[*stack_curr].length = len;
    (*stack_curr)++;
    *curr += len;

    /* more input to come */
    if (*curr != size) {
        return 1;
    }

    /* finish up: fold the whole stack into a single run */
    while (*stack_curr > 1) {
        TIM_SORT_MERGE(dst, run_stack, (int)*stack_curr, store, min_gallop_p);
        run_stack[*stack_curr - 2].length += run_stack[*stack_curr - 1].length;
        (*stack_curr)--;
    }

    if (store->storage != NULL) {
        free(store->storage);
        store->storage = NULL;
    }

    return 0;
}

/*
 * Tim sort entry point: sorts dst[0..size).
 * Arrays shorter than 64 elements are handed to SMALL_SORT.
 */
void TIM_SORT(SORT_TYPE *dst, const size_t size)
{
    TIM_SORT_RUN_T run_stack[TIM_SORT_STACK_SIZE];
    TEMP_STORAGE_T scratch;
    size_t minrun;
    size_t stack_curr = 0;
    size_t curr = 0;
    int min_gallop = TIM_SORT_MIN_GALLOP;
    int primed;

    /* zero or one element is already sorted */
    if (size <= 1) {
        return;
    }

    if (size < 64) {
        SMALL_SORT(dst, size);
        return;
    }

    /* compute the minimum run length */
    minrun = compute_minrun(size);
    /* temporary storage for merges starts out empty */
    scratch.alloc = 0;
    scratch.storage = NULL;

    /* start with up to three runs on the stack; PUSH_NEXT returns 0 once the sort is complete */
    for (primed = 0; primed < 3; primed++) {
        if (!PUSH_NEXT(dst, size, &scratch, minrun, run_stack, &stack_curr, &curr, &min_gallop)) {
            return;
        }
    }

    /* alternate between restoring the stack invariant and pushing the next run */
    for (;;) {
        if (CHECK_INVARIANT(run_stack, (int)stack_curr)) {
            if (!PUSH_NEXT(dst, size, &scratch, minrun, run_stack, &stack_curr, &curr, &min_gallop)) {
                return;
            }
        } else {
            stack_curr = TIM_SORT_COLLAPSE(dst, run_stack, (int)stack_curr, &scratch, size, &min_gallop);
        }
    }
}

/* heap sort: based on wikipedia */

/*
 * Sift the element at index start down the heap stored in dst[0..end]
 * (end inclusive). The children of index i are taken to be 2*i and 2*i + 1.
 */
static __inline void HEAP_SIFT_DOWN(SORT_TYPE *dst, const size_t start, const size_t end)
{
    size_t parent = start;
    size_t child;

    for (child = parent << 1; child <= end; child = parent << 1) {
        /* pick the larger of the two children when a second one exists */
        if ((child < end) && (SORT_CMP(dst[child], dst[child + 1]) < 0)) {
            child++;
        }

        /* parent is not below its larger child: heap property holds from here down */
        if (SORT_CMP(dst[parent], dst[child]) >= 0) {
            return;
        }

        SORT_SWAP(dst[parent], dst[child]);
        parent = child;
    }
}

/* Arrange dst[0..size) into a heap by sifting down from index size/2 back to index 0. */
static __inline void HEAPIFY(SORT_TYPE *dst, const size_t size)
{
    size_t start = size >> 1;

    /* index 0 is processed too; the loop stops right after it */
    do {
        HEAP_SIFT_DOWN(dst, start, size - 1);
    } while (start-- != 0);
}

/* Heap sort entry point: sorts dst[0..size). */
void HEAP_SORT(SORT_TYPE *dst, const size_t size)
{
    size_t end;

    /* zero or one element is already sorted */
    if (size <= 1) {
        return;
    }

    HEAPIFY(dst, size);

    /* move the heap's top element to the end, shrink the heap by one, restore it */
    for (end = size - 1; end > 0; end--) {
        SORT_SWAP(dst[end], dst[0]);
        HEAP_SIFT_DOWN(dst, 0, end - 1);
    }
}

/********* Sqrt sorting *********************************/
/*                                                       */
/* (c) 2014 by Andrey Astrelin                           */
/*                                                       */
/*                                                       */
/* Stable sorting that works in O(N*log(N)) worst time   */
/* and uses O(sqrt(N)) extra memory                      */
/*                                                       */
/* Define SORT_TYPE and SORT_CMP                         */
/* and then call SqrtSort() function                     */
/*                                                       */
/*********************************************************/

/* Pointer flavour of SORT_CMP: compares the elements that a and b point to. */
#define SORT_CMP_A(a,b) SORT_CMP(*(a),*(b))

/* Exchange the single elements that a and b point to. */
static __inline void SQRT_SORT_SWAP_1(SORT_TYPE *a, SORT_TYPE *b)
{
    SORT_TYPE held = *a;
    *a = *b;
    *b = held;
}

/* Exchange n consecutive elements starting at a with those starting at b, front to back. */
static __inline void SQRT_SORT_SWAP_N(SORT_TYPE *a, SORT_TYPE *b, int n)
{
    for (; n != 0; n--, a++, b++) {
        SQRT_SORT_SWAP_1(a, b);
    }
}


/*
 * Merge arr[0..L1) with arr[L1..L1+L2) from the back, writing the result so
 * that it ends at index L1 + L2 + M - 1 (i.e. shifted right by M).
 */
static void SQRT_SORT_MERGE_RIGHT(SORT_TYPE *arr, int L1, int L2, int M)
{
    int first = L1 - 1;             /* top of the first run  */
    int second = L1 + L2 - 1;       /* top of the second run */
    int out = L1 + L2 + M - 1;      /* next output slot      */

    /* until the first run is consumed, emit the larger top; ties go to the second run */
    for (; first >= 0; out--) {
        if (second >= L1 && SORT_CMP_A(arr + first, arr + second) <= 0) {
            arr[out] = arr[second--];
        } else {
            arr[out] = arr[first--];
        }
    }

    /* leftovers of the second run only need moving when they are not already in place */
    if (second != out) {
        for (; second >= L1; second--, out--) {
            arr[out] = arr[second];
        }
    }
}

/*
 * arr[M..-1] is free space; merge arr[0..L1-1] with arr[L1..L1+L2-1] from the
 * front, writing the result to arr[M..M+L1+L2-1].
 */
static void SQRT_SORT_MERGE_LEFT_WITH_X_BUF(SORT_TYPE *arr, int L1, int L2, int M)
{
    const int stop = L1 + L2;   /* one past the second run */
    int first = 0;
    int second = L1;

    /* until the second run is consumed, emit the smaller head; ties go to the first run */
    while (second < stop) {
        if (first != L1 && SORT_CMP_A(arr + first, arr + second) <= 0) {
            arr[M++] = arr[first++];
        } else {
            arr[M++] = arr[second++];
        }
    }

    /* leftovers of the first run only need moving when they are not already in place */
    if (M != first) {
        while (first < L1) {
            arr[M++] = arr[first++];
        }
    }
}

/*
 * Merge arr[0..L1-1] with arr2[0..L2-1] from the front; output is written
 * starting at arr[-L2] and therefore ends at arr[L1-1].
 * On equal elements the one from arr2 is emitted first.
 */
static void SQRT_SORT_MERGE_DOWN(SORT_TYPE *arr, SORT_TYPE *arr2, int L1, int L2)
{
    int own = 0;        /* read cursor in arr   */
    int other = 0;      /* read cursor in arr2  */
    int out = -L2;      /* write cursor in arr  */

    while (other < L2) {
        if (own != L1 && SORT_CMP_A(arr + own, arr2 + other) < 0) {
            arr[out++] = arr[own++];
        } else {
            arr[out++] = arr2[other++];
        }
    }

    /* leftovers of arr only need moving when they are not already in place */
    if (out != own) {
        while (own < L1) {
            arr[out++] = arr[own++];
        }
    }
}

/*
 * Merge arr[0..*alen1) with arr[*alen1..*alen1+len2) forwards, writing from
 * arr[-lkeys]. The merge stops as soon as either part is exhausted.
 *
 * If the first part still has elements, they are moved to the end of the
 * second part's area and *alen1 becomes their count (*atype unchanged).
 * Otherwise *alen1 becomes the number of unmerged elements of the second
 * part and *atype is flipped to 1 - *atype.
 */
static void SQRT_SORT_SMART_MERGE_WITH_X_BUF(SORT_TYPE *arr, int *alen1, int *atype, int len2,
        int lkeys)
{
    const int ftype = 1 - *atype; /* 1 if inverted */
    int end1 = *alen1;
    int end2 = end1 + len2;
    int out = -lkeys;
    int first = 0;
    int second = end1;

    /* ftype shifts the tie-break: with ftype == 1 equal elements are taken from the first part */
    while (first < end1 && second < end2) {
        if (SORT_CMP_A(arr + first, arr + second) - ftype < 0) {
            arr[out++] = arr[first++];
        } else {
            arr[out++] = arr[second++];
        }
    }

    if (first >= end1) {
        /* first part consumed: the rest belongs to the second part */
        *alen1 = end2 - second;
        *atype = ftype;
        return;
    }

    /* second part consumed: slide the rest of the first part up against end2 */
    *alen1 = end1 - first;

    while (first < end1) {
        arr[--end2] = arr[--end1];
    }
}


/*
  arr - starting array. arr[-lblock..-1] - buffer (if havebuf).
  lblock - length of regular blocks. First nblocks are stable sorted by 1st elements and key-coded
  keys - arrays of keys, in same order as blocks. key<midkey means stream A
  nblock2 are regular blocks from stream A. llast is length of last (irregular) block from stream B, that should go before nblock2 blocks.
  llast=0 requires nblock2=0 (no irregular blocks). llast>0, nblock2=0 is possible.

  The merged output is written lblock positions to the left of where the data was read.
*/
static void SQRT_SORT_MERGE_BUFFERS_LEFT_WITH_X_BUF(int *keys, int midkey, SORT_TYPE *arr,
        int nblock, int lblock, int nblock2, int llast)
{
    /* prest / lrest / frest: start, length and stream flag (0 or 1) of the part not yet written out;
       pidx: start of the block currently being looked at; fnext: its stream flag */
    int l, prest, lrest, frest, pidx, cidx, fnext;

    /* no regular blocks to interleave: just merge the nblock2 blocks with the last block */
    if (nblock == 0) {
        l = nblock2 * lblock;
        SQRT_SORT_MERGE_LEFT_WITH_X_BUF(arr, l, llast, -lblock);
        return;
    }

    lrest = lblock;
    frest = keys[0] < midkey ? 0 : 1;
    pidx = lblock;

    for (cidx = 1; cidx < nblock; cidx++, pidx += lblock) {
        prest = pidx - lrest;
        fnext = keys[cidx] < midkey ? 0 : 1;

        if (fnext == frest) {
            /* same stream as the pending part: copy the pending part lblock positions to the left;
               the whole current block becomes the new pending part */
            SORT_TYPE_CPY(arr + prest - lblock, arr + prest, lrest);
            prest = pidx;
            lrest = lblock;
        } else {
            /* different stream: merge; lrest and frest are updated to describe what is left over */
            SQRT_SORT_SMART_MERGE_WITH_X_BUF(arr + prest, &lrest, &frest, lblock, lblock);
        }
    }

    prest = pidx - lrest;

    if (llast) {
        if (frest) {
            /* pending part has flag 1: copy it out, then the nblock2 blocks form the part to merge */
            SORT_TYPE_CPY(arr + prest - lblock, arr + prest, lrest);
            prest = pidx;
            lrest = lblock * nblock2;
            frest = 0;
        } else {
            /* pending part has flag 0: the nblock2 blocks simply extend it */
            lrest += lblock * nblock2;
        }

        SQRT_SORT_MERGE_LEFT_WITH_X_BUF(arr + prest, lrest, llast, -lblock);
    } else {
        /* no irregular block: copy the pending part into place */
        SORT_TYPE_CPY(arr + prest - lblock, arr + prest, lrest);
    }
}

/*
  build blocks of length K
  input: [-K,-1] elements are buffer
  output: first K elements are buffer, blocks 2*K and last subblock sorted

  arr points at the first data element; L is the number of data elements.
  NOTE(review): the doubling loop below appears to assume K is a power of two -- confirm with the caller.
*/
static void SQRT_SORT_BUILD_BLOCKS(SORT_TYPE *arr, int L, int K)
{
    int m, u, h, p0, p1, rest, restk, p;

    /* pass 1: order each pair of elements and store it two positions to the left */
    for (m = 1; m < L; m += 2) {
        u = 0;

        if (SORT_CMP_A(arr + (m - 1), arr + m) > 0) {
            u = 1;
        }

        arr[m - 3] = arr[m - 1 + u];
        arr[m - 2] = arr[m - u];
    }

    /* odd length: the unpaired last element is shifted as well */
    if (L % 2) {
        arr[L - 3] = arr[L - 1];
    }

    /* the data now starts two positions earlier */
    arr -= 2;

    /* pass 2: merge neighbouring sorted pieces of length h, doubling h while it is below K;
       each round writes its output h positions to the left */
    for (h = 2; h < K; h *= 2) {
        p0 = 0;
        p1 = L - 2 * h;

        while (p0 <= p1) {
            SQRT_SORT_MERGE_LEFT_WITH_X_BUF(arr + p0, h, h, -h);
            p0 += 2 * h;
        }

        rest = L - p0;

        if (rest > h) {
            /* a full piece followed by a shorter one */
            SQRT_SORT_MERGE_LEFT_WITH_X_BUF(arr + p0, h, rest - h, -h);
        } else {
            /* at most one piece left: just shift it */
            for (; p0 < L; p0++) {
                arr[p0 - h] = arr[p0];
            }
        }

        arr -= h;
    }

    /* pass 3: one more merge level, this time from the back and writing K positions to the right */
    restk = L % (2 * K);
    p = L - restk;

    if (restk <= K) {
        /* trailing part is at most one piece: copy it K positions to the right */
        SORT_TYPE_CPY(arr + p + K, arr + p, restk);
    } else {
        SQRT_SORT_MERGE_RIGHT(arr + p, K, restk - K, K);
    }

    while (p > 0) {
        p -= 2 * K;
        SQRT_SORT_MERGE_RIGHT(arr + p, K, K, K);
    }
}


/* Insertion sort of arr[0..len-1]; each element is swapped leftwards while it is
   strictly smaller than its left neighbour. */
static void SQRT_SORT_SORT_INS(SORT_TYPE *arr, int len)
{
    int sorted, k;

    for (sorted = 1; sorted < len; sorted++) {
        k = sorted;

        while (k > 0 && SORT_CMP_A(arr + k, arr + (k - 1)) < 0) {
            SQRT_SORT_SWAP_1(arr + (k - 1), arr + k);
            k--;
        }
    }
}

/*
  Combine neighbouring runs of length LL in arr[0..len-1] in pairs.
  NOTE(review): presumably each run is already sorted and arr[-lblock..-1] is free
  scratch space on entry (the final loop reads arr[p - lblock]) - confirm against the caller.
  Per the original note, LL is expected to be a power of 2.

  arr    - start of the data
  len    - number of elements to process
  LL     - length of the runs that are paired up
  lblock - block length used when rearranging a pair of runs
  tags   - caller-supplied int array; refilled with 0,1,2,... for every pair and permuted
           together with the blocks, so a tag below LL/lblock marks a block of the first run
*/
static void SQRT_SORT_COMBINE_BLOCKS(SORT_TYPE *arr, int len, int LL, int lblock, int *tags)
{
    int M, b, NBlk, midkey, lrest, u, i, p, v, kc, nbl2, llast;
    SORT_TYPE *arr1;
    /* M complete pairs of runs, lrest elements left over behind them */
    M = len / (2 * LL);
    lrest = len % (2 * LL);

    /* a leftover no longer than one run has no partner: exclude it from this pass */
    if (lrest <= LL) {
        len -= lrest;
        lrest = 0;
    }

    /* b == M is the shortened trailing pair, present only when lrest != 0 */
    for (b = 0; b <= M; b++) {
        if (b == M && lrest == 0) {
            break;
        }

        arr1 = arr + b * 2 * LL;
        /* number of whole blocks in this pair */
        NBlk = (b == M ? lrest : 2 * LL) / lblock;
        u = NBlk + (b == M ? 1 : 0);

        /* reset tags[0..u] to the identity */
        for (i = 0; i <= u; i++) {
            tags[i] = i;
        }

        /* first tag value that belongs to the second run of the pair */
        midkey = LL / lblock;

        /* selection sort of the whole blocks by their first element;
           equal first elements are ordered by tag */
        for (u = 1; u < NBlk; u++) {
            p = u - 1;

            for (v = u; v < NBlk; v++) {
                kc = SORT_CMP_A(arr1 + p * lblock, arr1 + v * lblock);

                if (kc > 0 || (kc == 0 && tags[p] > tags[v])) {
                    p = v;
                }
            }

            if (p != u - 1) {
                SQRT_SORT_SWAP_N(arr1 + (u - 1)*lblock, arr1 + p * lblock, lblock);
                i = tags[u - 1];
                tags[u - 1] = tags[p];
                tags[p] = i;
            }
        }

        nbl2 = llast = 0;

        /* only the trailing pair can end in a partial block */
        if (b == M) {
            llast = lrest % lblock;
        }

        /* count the trailing whole blocks whose first element is greater than
           the first element of the partial block */
        if (llast != 0) {
            while (nbl2 < NBlk && SORT_CMP_A(arr1 + NBlk * lblock, arr1 + (NBlk - nbl2 - 1)*lblock) < 0) {
                nbl2++;
            }
        }

        SQRT_SORT_MERGE_BUFFERS_LEFT_WITH_X_BUF(tags, midkey, arr1, NBlk - nbl2, lblock, nbl2, llast);
    }

    /* move everything lblock positions to the right, copying from the top down */
    for (p = len; --p >= 0;) {
        arr[p] = arr[p - lblock];
    }
}


/*
  Core of sqrt sort.
  arr/Len - data to sort
  extbuf  - scratch array; the first block of arr is copied into it
  Tags    - int scratch array passed through to SQRT_SORT_COMBINE_BLOCKS
  Inputs shorter than 16 elements are handled by insertion sort.
*/
static void SQRT_SORT_COMMON_SORT(SORT_TYPE *arr, int Len, SORT_TYPE *extbuf, int *Tags)
{
    int blk, run;

    if (Len < 16) {
        SQRT_SORT_SORT_INS(arr, Len);
        return;
    }

    /* smallest power of two whose square is not below Len */
    for (blk = 1; blk * blk < Len; blk *= 2) {
        /* empty */
    }

    /* sort the first block separately inside extbuf; arr serves as its scratch */
    SORT_TYPE_CPY(extbuf, arr, blk);
    SQRT_SORT_COMMON_SORT(extbuf, blk, arr, Tags);

    SQRT_SORT_BUILD_BLOCKS(arr + blk, Len - blk, blk);

    /* pair up runs of doubling length for as long as a run is shorter than Len */
    for (run = 2 * blk; run < Len; run *= 2) {
        SQRT_SORT_COMBINE_BLOCKS(arr + blk, Len - blk, run, blk, Tags);
    }

    SQRT_SORT_MERGE_DOWN(arr + blk, extbuf, Len - blk, blk);
}

/*
  Sqrt sort entry point: sorts arr[0..Len-1].

  Two temporary arrays are allocated for the duration of the call: L elements of
  SORT_TYPE (L is the smallest power of two with L*L >= Len) and an int tag array.
  If either allocation fails the function returns without touching arr.

  Len is narrowed to int before the actual sort is called.
*/
void SQRT_SORT(SORT_TYPE *arr, size_t Len)
{
    int L = 1;
    SORT_TYPE *ExtBuf;
    int *Tags;
    int NK;

    /* nothing to sort; this also keeps (Len - 1) below from wrapping when Len is 0 */
    if (Len < 2) {
        return;
    }

    /* multiply in size_t rather than int */
    while ((size_t)L * (size_t)L < Len) {
        L *= 2;
    }

    NK = (int)((Len - 1) / (size_t)L + 2);
    ExtBuf = SORT_NEW_BUFFER(L);

    if (ExtBuf == NULL) {
        return;  /* fail */
    }

    Tags = (int*)malloc(NK * sizeof(int));

    if (Tags == NULL) {
        /* release the first buffer instead of leaking it */
        SORT_DELETE_BUFFER(ExtBuf);
        return;  /* fail */
    }

    SQRT_SORT_COMMON_SORT(arr, (int)Len, ExtBuf, Tags);
    free(Tags);
    SORT_DELETE_BUFFER(ExtBuf);
}

/********* Grail sorting *********************************/
/*                                                       */
/* (c) 2013 by Andrey Astrelin                           */
/*                                                       */
/*                                                       */
/* Stable sorting that works in O(N*log(N)) worst time   */
/* and uses O(1) extra memory                            */
/*                                                       */
/* Define SORT_TYPE and SORT_CMP                         */
/* and then call GrailSort() function                    */
/*                                                       */
/* For sorting with fixed external buffer (512 items)    */
/* use GrailSortWithBuffer()                             */
/*                                                       */
/* For sorting with dynamic external buffer (O(sqrt(N)) items) */
/* use GrailSortWithDynBuffer()                          */
/*                                                       */
/* Also classic in-place merge sort is implemented       */
/* under the name of RecStableSort()                     */
/*                                                       */
/*********************************************************/

/* Element count of the on-stack scratch array that GRAIL_SORT_FIXED_BUFFER passes to the sort. */
#define GRAIL_EXT_BUFFER_LENGTH 512

/* Exchange the two elements pointed to by a and b. */
static __inline void GRAIL_SWAP1(SORT_TYPE *a, SORT_TYPE *b)
{
    SORT_TYPE held = *a;

    *a = *b;
    *b = held;
}

/* Exchange a[i] with b[i] for i = 0..n-1, in ascending order. */
static __inline void GRAIL_SWAP_N(SORT_TYPE *a, SORT_TYPE *b, int n)
{
    for (; n != 0; --n) {
        GRAIL_SWAP1(a, b);
        ++a;
        ++b;
    }
}

/* Rotate a[0..l1+l2-1] in place so that the last l2 elements come first,
   by repeatedly swapping the shorter part into its final position. */
static void GRAIL_ROTATE(SORT_TYPE *a, int l1, int l2)
{
    while (l1 != 0 && l2 != 0) {
        if (l1 > l2) {
            /* right part is shorter: it swaps into the end of the left part */
            GRAIL_SWAP_N(a + (l1 - l2), a + l1, l2);
            l1 -= l2;
        } else {
            /* left part is not longer: it swaps with the start of the right part */
            GRAIL_SWAP_N(a, a + l1, l1);
            a += l1;
            l2 -= l1;
        }
    }
}

/* Binary search in arr[0..len-1]: returns the smallest index whose element does not
   compare below *key, or len when every element compares below it. */
static int GRAIL_BIN_SEARCH_LEFT(SORT_TYPE *arr, int len, SORT_TYPE *key)
{
    int lo = -1;
    int hi = len;
    int mid;

    while (lo + 1 < hi) {
        mid = lo + ((hi - lo) >> 1);

        if (SORT_CMP_A(arr + mid, key) < 0) {
            lo = mid;
        } else {
            hi = mid;
        }
    }

    return hi;
}
/* Binary search in arr[0..len-1]: returns the smallest index whose element compares
   above *key, or len when no element does. */
static int GRAIL_BIN_SEARCH_RIGHT(SORT_TYPE *arr, int len, SORT_TYPE *key)
{
    int lo = -1;
    int hi = len;
    int mid;

    while (lo + 1 < hi) {
        mid = lo + ((hi - lo) >> 1);

        if (SORT_CMP_A(arr + mid, key) <= 0) {
            lo = mid;
        } else {
            hi = mid;
        }
    }

    return hi;
}

/*
  Gather up to nkeys mutually distinct elements of arr[0..len-1] at the front of arr
  and return how many were gathered.
  cost: 2*len+nk^2/2
*/
static int GRAIL_FIND_KEYS(SORT_TYPE *arr, int len, int nkeys)
{
    int found = 1, start = 0; /* arr[0] is always the first key */
    int scan, pos;

    /* the keys collected so far form a sorted window arr[start..start+found-1] */
    for (scan = 1; scan < len && found < nkeys; scan++) {
        pos = GRAIL_BIN_SEARCH_LEFT(arr + start, found, arr + scan);

        /* an equal key is already in the window: skip this element */
        if (pos != found && SORT_CMP_A(arr + scan, arr + (start + pos)) == 0) {
            continue;
        }

        /* slide the window up against the new key ... */
        GRAIL_ROTATE(arr + start, found, scan - (start + found));
        start = scan - found;
        /* ... and rotate the key into its sorted position inside the window */
        GRAIL_ROTATE(arr + (start + pos), found - pos, 1);
        found++;
    }

    /* bring the window back to the front of the array */
    GRAIL_ROTATE(arr, start, found);
    return found;
}

/*
  Merge arr[0..len1-1] with arr[len1..len1+len2-1] in place using only binary
  searches and rotations. The shorter side drives the loop.
  cost: min(L1,L2)^2+max(L1,L2)
*/
static void GRAIL_MERGE_WITHOUT_BUFFER(SORT_TYPE *arr, int len1, int len2)
{
    int cut;

    if (len1 >= len2) {
        /* work from the right end: place the tail of the right part */
        while (len2 != 0) {
            cut = GRAIL_BIN_SEARCH_RIGHT(arr, len1, arr + (len1 + len2 - 1));

            if (cut != len1) {
                GRAIL_ROTATE(arr + cut, len1 - cut, len2);
                len1 = cut;
            }

            if (len1 == 0) {
                return;
            }

            /* drop right-part elements that are already in place */
            do {
                len2--;
            } while (len2 != 0 && SORT_CMP_A(arr + len1 - 1, arr + len1 + len2 - 1) <= 0);
        }

        return;
    }

    /* work from the left end: place the head of the left part */
    while (len1 != 0) {
        cut = GRAIL_BIN_SEARCH_LEFT(arr + len1, len2, arr);

        if (cut != 0) {
            GRAIL_ROTATE(arr, len1, cut);
            arr += cut;
            len2 -= cut;
        }

        if (len2 == 0) {
            return;
        }

        /* drop left-part elements that are already in place */
        do {
            arr++;
            len1--;
        } while (len1 != 0 && SORT_CMP_A(arr, arr + len1) <= 0);
    }
}

/* arr[M..-1] - buffer, arr[0,L1-1]++arr[L1,L1+L2-1] -> arr[M,M+L1+L2-1]
   Elements are moved by swapping, so the buffer contents survive (displaced). */
static void GRAIL_MERGE_LEFT(SORT_TYPE *arr, int L1, int L2, int M)
{
    int left = 0;
    int right = L1;
    int end = L1 + L2;

    while (right < end) {
        /* take from the left run unless it is empty or its head is greater */
        if (left != L1 && SORT_CMP_A(arr + left, arr + right) <= 0) {
            GRAIL_SWAP1(arr + M, arr + left);
            left++;
        } else {
            GRAIL_SWAP1(arr + M, arr + right);
            right++;
        }

        M++;
    }

    /* whatever is left of the first run follows, unless it already sits in place */
    if (M != left) {
        GRAIL_SWAP_N(arr + M, arr + left, L1 - left);
    }
}
/* Mirror image of GRAIL_MERGE_LEFT: merges arr[0..L1-1] with arr[L1..L1+L2-1]
   back to front by swapping, so that the output ends at index L1+L2+M-1. */
static void GRAIL_MERGE_RIGHT(SORT_TYPE *arr, int L1, int L2, int M)
{
    int out = L1 + L2 + M - 1;
    int right = L1 + L2 - 1;
    int left = L1 - 1;

    while (left >= 0) {
        /* take from the right run unless it is empty or the left tail is greater */
        if (right >= L1 && SORT_CMP_A(arr + left, arr + right) <= 0) {
            GRAIL_SWAP1(arr + out, arr + right);
            right--;
        } else {
            GRAIL_SWAP1(arr + out, arr + left);
            left--;
        }

        out--;
    }

    /* move the remainder of the right run, unless it already sits in place */
    if (right != out) {
        for (; right >= L1; right--, out--) {
            GRAIL_SWAP1(arr + out, arr + right);
        }
    }
}

/*
  Merge the fragment arr[0..*alen1-1] with the following len2 elements by swapping
  into the area that starts at arr[-lkeys].
  *atype selects the tie-breaking direction (ties go to the other side when it is 0).
  On return *alen1 is the length of the unmerged remainder; *atype is flipped when
  that remainder comes from the second part.
*/
static void GRAIL_SMART_MERGE_WITH_BUFFER(SORT_TYPE *arr, int *alen1, int *atype, int len2,
        int lkeys)
{
    int out = -lkeys;
    int a = 0, a_end = *alen1;
    int b = a_end, b_end = a_end + len2;
    int inverted = 1 - *atype; /* 1 if inverted */

    while (a < a_end && b < b_end) {
        if (SORT_CMP_A(arr + a, arr + b) - inverted >= 0) {
            GRAIL_SWAP1(arr + out, arr + b);
            b++;
        } else {
            GRAIL_SWAP1(arr + out, arr + a);
            a++;
        }

        out++;
    }

    if (a >= a_end) {
        /* first part used up: the remainder belongs to the second part */
        *alen1 = b_end - b;
        *atype = inverted;
        return;
    }

    /* second part used up: push what is left of the first part to the far end */
    *alen1 = a_end - a;

    while (a < a_end) {
        --a_end;
        --b_end;
        GRAIL_SWAP1(arr + a_end, arr + b_end);
    }
}
/*
  Buffer-less counterpart of GRAIL_SMART_MERGE_WITH_BUFFER: merges the fragment
  arr[0..*alen1-1] with the following _len2 elements using rotations.
  On return *alen1 is the length of the unmerged remainder; *atype is flipped when
  that remainder comes from the second part.
*/
static void GRAIL_SMART_MERGE_WITHOUT_BUFFER(SORT_TYPE *arr, int *alen1, int *atype, int _len2)
{
    int left_len, right_len, inverted, shift;

    if (_len2 == 0) {
        return;
    }

    left_len = *alen1;
    right_len = _len2;
    inverted = 1 - *atype;

    /* only work when the boundary between the two parts is out of order */
    if (left_len != 0 && SORT_CMP_A(arr + (left_len - 1), arr + left_len) - inverted >= 0) {
        while (left_len != 0) {
            if (inverted) {
                shift = GRAIL_BIN_SEARCH_LEFT(arr + left_len, right_len, arr);
            } else {
                shift = GRAIL_BIN_SEARCH_RIGHT(arr + left_len, right_len, arr);
            }

            if (shift != 0) {
                GRAIL_ROTATE(arr, left_len, shift);
                arr += shift;
                right_len -= shift;
            }

            if (right_len == 0) {
                /* second part used up: the remainder is still of the old type */
                *alen1 = left_len;
                return;
            }

            do {
                arr++;
                left_len--;
            } while (left_len != 0 && SORT_CMP_A(arr, arr + left_len) - inverted < 0);
        }
    }

    *alen1 = right_len;
    *atype = inverted;
}

/***** Sort With Extra Buffer *****/

/* arr[M..-1] - free, arr[0,L1-1]++arr[L1,L1+L2-1] -> arr[M,M+L1+L2-1]
   Elements are moved by plain assignment; the previous contents of the free area are lost. */
static void GRAIL_MERGE_LEFT_WITH_X_BUF(SORT_TYPE *arr, int L1, int L2, int M)
{
    int left = 0;
    int right = L1;
    int end = L1 + L2;

    while (right < end) {
        /* take from the left run unless it is empty or its head is greater */
        if (left != L1 && SORT_CMP_A(arr + left, arr + right) <= 0) {
            arr[M] = arr[left];
            left++;
        } else {
            arr[M] = arr[right];
            right++;
        }

        M++;
    }

    /* whatever is left of the first run follows, unless it already sits in place */
    if (M != left) {
        for (; left < L1; left++, M++) {
            arr[M] = arr[left];
        }
    }
}

/*
  Assignment-based variant of GRAIL_SMART_MERGE_WITH_BUFFER: merges the fragment
  arr[0..*alen1-1] with the following len2 elements into the area that starts at
  arr[-lkeys], overwriting what was there.
  On return *alen1 is the length of the unmerged remainder; *atype is flipped when
  that remainder comes from the second part.
*/
static void GRAIL_SMART_MERGE_WITH_X_BUF(SORT_TYPE *arr, int *alen1, int *atype, int len2,
        int lkeys)
{
    int out = -lkeys;
    int a = 0, a_end = *alen1;
    int b = a_end, b_end = a_end + len2;
    int inverted = 1 - *atype; /* 1 if inverted */

    while (a < a_end && b < b_end) {
        if (SORT_CMP_A(arr + a, arr + b) - inverted >= 0) {
            arr[out] = arr[b];
            b++;
        } else {
            arr[out] = arr[a];
            a++;
        }

        out++;
    }

    if (a >= a_end) {
        /* first part used up: the remainder belongs to the second part */
        *alen1 = b_end - b;
        *atype = inverted;
        return;
    }

    /* second part used up: copy what is left of the first part to the far end */
    *alen1 = a_end - a;

    while (a < a_end) {
        --a_end;
        --b_end;
        arr[b_end] = arr[a_end];
    }
}

/*
  arr - starting array. arr[-lblock..-1] is free space that the output is written into.
  lblock - length of regular blocks. The first nblock blocks are stably sorted by their
           first elements and key-coded.
  keys - array of keys, in the same order as the blocks. key<midkey means stream A.
  nblock2 - number of regular blocks from stream A that follow; llast is the length of the
            last (irregular) block from stream B, which has to go before those nblock2 blocks.
  llast=0 requires nblock2=0 (no irregular blocks). llast>0, nblock2=0 is possible.
*/
static void GRAIL_MERGE_BUFFERS_LEFT_WITH_X_BUF(SORT_TYPE *keys, SORT_TYPE *midkey, SORT_TYPE *arr,
        int nblock, int lblock, int nblock2, int llast)
{
    int blk, blk_pos, rest_pos, rest_len, rest_stream, next_stream;

    if (nblock == 0) {
        GRAIL_MERGE_LEFT_WITH_X_BUF(arr, nblock2 * lblock, llast, -lblock);
        return;
    }

    /* the pending fragment starts out as the whole first block */
    rest_len = lblock;
    rest_stream = (SORT_CMP_A(keys, midkey) < 0) ? 0 : 1;

    for (blk = 1, blk_pos = lblock; blk < nblock; blk++, blk_pos += lblock) {
        rest_pos = blk_pos - rest_len;
        next_stream = (SORT_CMP_A(keys + blk, midkey) < 0) ? 0 : 1;

        if (next_stream != rest_stream) {
            GRAIL_SMART_MERGE_WITH_X_BUF(arr + rest_pos, &rest_len, &rest_stream, lblock, lblock);
        } else {
            /* same stream: the pending fragment is final, the next block becomes pending */
            SORT_TYPE_CPY(arr + rest_pos - lblock, arr + rest_pos, rest_len);
            rest_len = lblock;
        }
    }

    rest_pos = blk_pos - rest_len;

    if (!llast) {
        SORT_TYPE_CPY(arr + rest_pos - lblock, arr + rest_pos, rest_len);
        return;
    }

    if (rest_stream) {
        /* pending fragment is from stream B: emit it, then merge only the nblock2 blocks */
        SORT_TYPE_CPY(arr + rest_pos - lblock, arr + rest_pos, rest_len);
        rest_pos = blk_pos;
        rest_len = lblock * nblock2;
    } else {
        /* pending fragment is from stream A: it merges together with the nblock2 blocks */
        rest_len += lblock * nblock2;
    }

    GRAIL_MERGE_LEFT_WITH_X_BUF(arr + rest_pos, rest_len, llast, -lblock);
}

/***** End Sort With Extra Buffer *****/

/*
  build blocks of length K
  input: [-K,-1] elements are buffer
  output: first K elements are buffer, blocks 2*K and last subblock sorted

  extbuf/LExtBuf describe an optional external scratch array. When one is usable, the
  first merge passes move elements by plain assignment (the overwritten buffer
  elements are saved in extbuf and put back afterwards); the remaining passes move
  elements by swapping with the in-array buffer.
*/
static void GRAIL_BUILD_BLOCKS(SORT_TYPE *arr, int L, int K, SORT_TYPE *extbuf, int LExtBuf)
{
    int m, u, h, p0, p1, rest, restk, p, kbuf;
    /* usable part of the external buffer: min(K, LExtBuf) ... */
    kbuf = K < LExtBuf ? K : LExtBuf;

    /* ... rounded down to a power of 2 by clearing low set bits */
    while (kbuf & (kbuf - 1)) {
        kbuf &= kbuf - 1;  /* largest power of 2 - just in case */
    }

    if (kbuf) {
        /* save the buffer elements that the assignment-based passes will overwrite */
        SORT_TYPE_CPY(extbuf, arr - kbuf, kbuf);

        /* order every adjacent pair while moving it two slots to the left */
        for (m = 1; m < L; m += 2) {
            u = 0;

            if (SORT_CMP_A(arr + (m - 1), arr + m) > 0) {
                u = 1;
            }

            arr[m - 3] = arr[m - 1 + u];
            arr[m - 2] = arr[m - u];
        }

        if (L % 2) {
            arr[L - 3] = arr[L - 1];
        }

        arr -= 2;

        /* doubling merge passes by assignment; each pass moves the data h slots left */
        for (h = 2; h < kbuf; h *= 2) {
            p0 = 0;
            p1 = L - 2 * h;

            while (p0 <= p1) {
                GRAIL_MERGE_LEFT_WITH_X_BUF(arr + p0, h, h, -h);
                p0 += 2 * h;
            }

            rest = L - p0;

            if (rest > h) {
                GRAIL_MERGE_LEFT_WITH_X_BUF(arr + p0, h, rest - h, -h);
            } else {
                /* a single run is left over: just shift it */
                for (; p0 < L; p0++) {
                    arr[p0 - h] = arr[p0];
                }
            }

            arr -= h;
        }

        /* put the saved buffer elements back, now behind the data */
        SORT_TYPE_CPY(arr + L, extbuf, kbuf);
    } else {
        /* no external buffer: same pair step, but swapping so the buffer elements survive */
        for (m = 1; m < L; m += 2) {
            u = 0;

            if (SORT_CMP_A(arr + (m - 1), arr + m) > 0) {
                u = 1;
            }

            GRAIL_SWAP1(arr + (m - 3), arr + (m - 1 + u));
            GRAIL_SWAP1(arr + (m - 2), arr + (m - u));
        }

        if (L % 2) {
            GRAIL_SWAP1(arr + (L - 1), arr + (L - 3));
        }

        arr -= 2;
        h = 2;
    }

    /* swap-based doubling passes, continuing from the run length h reached above */
    for (; h < K; h *= 2) {
        p0 = 0;
        p1 = L - 2 * h;

        while (p0 <= p1) {
            GRAIL_MERGE_LEFT(arr + p0, h, h, -h);
            p0 += 2 * h;
        }

        rest = L - p0;

        if (rest > h) {
            GRAIL_MERGE_LEFT(arr + p0, h, rest - h, -h);
        } else {
            GRAIL_ROTATE(arr + p0 - h, h, rest);
        }

        arr -= h;
    }

    /* last doubling step runs right-to-left: the trailing partial group first ... */
    restk = L % (2 * K);
    p = L - restk;

    if (restk <= K) {
        GRAIL_ROTATE(arr + p, restk, K);
    } else {
        GRAIL_MERGE_RIGHT(arr + p, K, restk - K, K);
    }

    /* ... then every full pair of K-runs */
    while (p > 0) {
        p -= 2 * K;
        GRAIL_MERGE_RIGHT(arr + p, K, K, K);
    }
}

/*
  arr - starting array. arr[-lblock..-1] - buffer (if havebuf).
  lblock - length of regular blocks. The first nblock blocks are stably sorted by their
           first elements and key-coded.
  keys - array of keys, in the same order as the blocks. key<midkey means stream A.
  nblock2 - number of regular blocks from stream A that follow; llast is the length of the
            last (irregular) block from stream B, which has to go before those nblock2 blocks.
  llast=0 requires nblock2=0 (no irregular blocks). llast>0, nblock2=0 is possible.
  With havebuf the data is moved by swapping through the buffer; without it everything
  is merged in place.
*/
static void GRAIL_MERGE_BUFFERS_LEFT(SORT_TYPE *keys, SORT_TYPE *midkey, SORT_TYPE *arr, int nblock,
                                     int lblock, int havebuf, int nblock2, int llast)
{
    int blk, blk_pos, rest_pos, rest_len, rest_stream, next_stream;

    if (nblock == 0) {
        if (!havebuf) {
            GRAIL_MERGE_WITHOUT_BUFFER(arr, nblock2 * lblock, llast);
        } else {
            GRAIL_MERGE_LEFT(arr, nblock2 * lblock, llast, -lblock);
        }

        return;
    }

    /* the pending fragment starts out as the whole first block */
    rest_len = lblock;
    rest_stream = (SORT_CMP_A(keys, midkey) < 0) ? 0 : 1;

    for (blk = 1, blk_pos = lblock; blk < nblock; blk++, blk_pos += lblock) {
        rest_pos = blk_pos - rest_len;
        next_stream = (SORT_CMP_A(keys + blk, midkey) < 0) ? 0 : 1;

        if (next_stream == rest_stream) {
            /* same stream: the pending fragment is final, the next block becomes pending */
            if (havebuf) {
                GRAIL_SWAP_N(arr + rest_pos - lblock, arr + rest_pos, rest_len);
            }

            rest_len = lblock;
        } else if (havebuf) {
            GRAIL_SMART_MERGE_WITH_BUFFER(arr + rest_pos, &rest_len, &rest_stream, lblock, lblock);
        } else {
            GRAIL_SMART_MERGE_WITHOUT_BUFFER(arr + rest_pos, &rest_len, &rest_stream, lblock);
        }
    }

    rest_pos = blk_pos - rest_len;

    if (llast == 0) {
        if (havebuf) {
            GRAIL_SWAP_N(arr + rest_pos, arr + (rest_pos - lblock), rest_len);
        }

        return;
    }

    if (rest_stream != 0) {
        /* pending fragment is from stream B: emit it, then merge only the nblock2 blocks */
        if (havebuf) {
            GRAIL_SWAP_N(arr + rest_pos - lblock, arr + rest_pos, rest_len);
        }

        rest_pos = blk_pos;
        rest_len = lblock * nblock2;
    } else {
        /* pending fragment is from stream A: it merges together with the nblock2 blocks */
        rest_len += lblock * nblock2;
    }

    if (havebuf) {
        GRAIL_MERGE_LEFT(arr + rest_pos, rest_len, llast, -lblock);
    } else {
        GRAIL_MERGE_WITHOUT_BUFFER(arr + rest_pos, rest_len, llast);
    }
}

/* Bottom-up merge sort of arr[0..L-1] built entirely on GRAIL_MERGE_WITHOUT_BUFFER. */
static void GRAIL_LAZY_STABLE_SORT(SORT_TYPE *arr, int L)
{
    int i, run, pos, last_full, tail;

    /* order adjacent pairs first */
    for (i = 1; i < L; i += 2) {
        if (SORT_CMP_A(arr + i - 1, arr + i) > 0) {
            GRAIL_SWAP1(arr + (i - 1), arr + i);
        }
    }

    /* then merge runs of doubling length */
    for (run = 2; run < L; run *= 2) {
        last_full = L - 2 * run;

        for (pos = 0; pos <= last_full; pos += 2 * run) {
            GRAIL_MERGE_WITHOUT_BUFFER(arr + pos, run, run);
        }

        /* a trailing full run followed by a shorter one still needs merging */
        tail = L - pos;

        if (tail > run) {
            GRAIL_MERGE_WITHOUT_BUFFER(arr + pos, run, tail - run);
        }
    }
}

/*
  keys are on the left of arr. Blocks of length LL combined. We'll combine them in pairs
  LL and nkeys are powers of 2. (2*LL/lblock) keys are guaranteed

  keys    - key elements, permuted together with the blocks of each pair
  arr/len - the data
  LL      - length of the runs that are paired up
  lblock  - block length used when rearranging a pair of runs
  havebuf - non-zero when arr[-lblock..-1] may be used as a swap buffer
  xbuf    - optional external array of at least lblock elements; when given, the
            in-array buffer is saved there and the merge works by plain assignment
*/
static void GRAIL_COMBINE_BLOCKS(SORT_TYPE *keys, SORT_TYPE *arr, int len, int LL, int lblock,
                                 int havebuf, SORT_TYPE *xbuf)
{
    int M, b, NBlk, midkey, lrest, u, p, v, kc, nbl2, llast;
    SORT_TYPE *arr1;
    /* M complete pairs of runs, lrest elements left over behind them */
    M = len / (2 * LL);
    lrest = len % (2 * LL);

    /* a leftover no longer than one run has no partner: exclude it from this pass */
    if (lrest <= LL) {
        len -= lrest;
        lrest = 0;
    }

    /* save the in-array buffer; the assignment-based merge overwrites it */
    if (xbuf) {
        SORT_TYPE_CPY(xbuf, arr - lblock, lblock);
    }

    /* b == M is the shortened trailing pair, present only when lrest != 0 */
    for (b = 0; b <= M; b++) {
        if (b == M && lrest == 0) {
            break;
        }

        arr1 = arr + b * 2 * LL;
        /* number of whole blocks in this pair */
        NBlk = (b == M ? lrest : 2 * LL) / lblock;
        /* bring the keys used for this pair back into sorted order */
        SMALL_SORT(keys, NBlk + (b == M ? 1 : 0));
        /* index of the first key that belongs to the second run */
        midkey = LL / lblock;

        /* selection sort of the whole blocks by their first element;
           equal first elements are ordered by their keys */
        for (u = 1; u < NBlk; u++) {
            p = u - 1;

            for (v = u; v < NBlk; v++) {
                kc = SORT_CMP_A(arr1 + p * lblock, arr1 + v * lblock);

                if (kc > 0 || (kc == 0 && SORT_CMP_A(keys + p, keys + v) > 0)) {
                    p = v;
                }
            }

            if (p != u - 1) {
                GRAIL_SWAP_N(arr1 + (u - 1)*lblock, arr1 + p * lblock, lblock);
                GRAIL_SWAP1(keys + (u - 1), keys + p);

                /* keep midkey pointing at the same key element: if it was one of
                   the two swapped slots, the XOR turns it into the other one */
                if (midkey == u - 1 || midkey == p) {
                    midkey ^= (u - 1)^p;
                }
            }
        }

        nbl2 = llast = 0;

        /* only the trailing pair can end in a partial block */
        if (b == M) {
            llast = lrest % lblock;
        }

        /* count the trailing whole blocks whose first element is greater than
           the first element of the partial block */
        if (llast != 0) {
            while (nbl2 < NBlk && SORT_CMP_A(arr1 + NBlk * lblock, arr1 + (NBlk - nbl2 - 1)*lblock) < 0) {
                nbl2++;
            }
        }

        if (xbuf) {
            GRAIL_MERGE_BUFFERS_LEFT_WITH_X_BUF(keys, keys + midkey, arr1, NBlk - nbl2, lblock, nbl2, llast);
        } else {
            GRAIL_MERGE_BUFFERS_LEFT(keys, keys + midkey, arr1, NBlk - nbl2, lblock, havebuf, nbl2, llast);
        }
    }

    if (xbuf) {
        /* move the data lblock positions to the right (top down), then restore the buffer */
        for (p = len; --p >= 0;) {
            arr[p] = arr[p - lblock];
        }

        SORT_TYPE_CPY(arr - lblock, xbuf, lblock);
    } else if (havebuf) {
        /* same shift done with swaps, which carries the buffer elements back to the front */
        while (--len >= 0) {
            GRAIL_SWAP1(arr + len, arr + len - lblock);
        }
    }
}


/*
  Core of Grail sort.
  arr/Len        - data to sort
  extbuf/LExtBuf - optional external scratch array and its length (NULL/0 for none)

  Inputs of at most SMALL_SORT_BND elements go straight to SMALL_SORT.
*/
static void GRAIL_COMMON_SORT(SORT_TYPE *arr, int Len, SORT_TYPE *extbuf, int LExtBuf)
{
    int lblock, nkeys, findkeys, ptr, cbuf, lb, nk;
    int havebuf, chavebuf;
    long long s;

    if (Len <= SMALL_SORT_BND) {
        SMALL_SORT(arr, Len);
        return;
    }

    /* block length: smallest power of 2 whose square is not below Len */
    lblock = 1;

    while (lblock * lblock < Len) {
        lblock *= 2;
    }

    /* one key per block, plus lblock more elements to serve as the in-array buffer */
    nkeys = (Len - 1) / lblock + 1;
    findkeys = GRAIL_FIND_KEYS(arr, Len, nkeys + lblock);
    havebuf = 1;

    if (findkeys < nkeys + lblock) {
        /* too few distinct elements for keys plus buffer */
        if (findkeys < 4) {
            GRAIL_LAZY_STABLE_SORT(arr, Len);
            return;
        }

        /* halve lblock until the result does not exceed the number of keys found */
        nkeys = lblock;

        while (nkeys > findkeys) {
            nkeys /= 2;
        }

        /* no dedicated buffer in this mode */
        havebuf = 0;
        lblock = 0;
    }

    /* the data proper starts behind the keys and the buffer */
    ptr = lblock + nkeys;
    cbuf = havebuf ? lblock : nkeys;

    if (havebuf) {
        GRAIL_BUILD_BLOCKS(arr + ptr, Len - ptr, cbuf, extbuf, LExtBuf);
    } else {
        GRAIL_BUILD_BLOCKS(arr + ptr, Len - ptr, cbuf, NULL, 0);
    }

    /* runs of length 2*cbuf are built; keep pairing them up while one run is shorter than the data */
    while (Len - ptr > (cbuf *= 2)) {
        lb = lblock;
        chavebuf = havebuf;

        if (!havebuf) {
            if (nkeys > 4 && nkeys / 8 * nkeys >= cbuf) {
                /* NOTE(review): presumably the upper half of the key area then doubles
                   as the swap buffer for this pass - confirm */
                lb = nkeys / 2;
                chavebuf = 1;
            } else {
                /* double nk (and divide s by 8) until nk reaches nkeys or s hits zero,
                   then derive the block length from nk */
                nk = 1;
                s = (long long)cbuf * findkeys / 2;

                while (nk < nkeys && s != 0) {
                    nk *= 2;
                    s /= 8;
                }

                lb = (2 * cbuf) / nk;
            }
        }

        /* the external buffer is handed on only when a buffer is in use and a block fits into it */
        GRAIL_COMBINE_BLOCKS(arr, arr + ptr, Len - ptr, cbuf, lb, chavebuf, chavebuf
                             && lb <= LExtBuf ? extbuf : NULL);
    }

    /* finally sort the key/buffer prefix and merge it with the rest */
    SMALL_SORT(arr, ptr);
    GRAIL_MERGE_WITHOUT_BUFFER(arr, ptr, Len - ptr);
}

/* Grail sort of arr[0..Len-1] without any external buffer. Len is narrowed to int. */
void GRAIL_SORT(SORT_TYPE *arr, size_t Len)
{
    const int count = (int)Len;

    GRAIL_COMMON_SORT(arr, count, NULL, 0);
}

/* Grail sort of arr[0..Len-1] with a scratch array of GRAIL_EXT_BUFFER_LENGTH
   elements placed on the stack. Len is narrowed to int. */
void GRAIL_SORT_FIXED_BUFFER(SORT_TYPE *arr, size_t Len)
{
    SORT_TYPE scratch[GRAIL_EXT_BUFFER_LENGTH];
    const int count = (int)Len;

    GRAIL_COMMON_SORT(arr, count, scratch, GRAIL_EXT_BUFFER_LENGTH);
}

/* Grail sort of arr[0..Len-1] with a scratch array obtained from SORT_NEW_BUFFER;
   its size is the smallest power of two whose square is not below Len.
   Falls back to the fixed on-stack buffer when that allocation fails. */
void GRAIL_SORT_DYN_BUFFER(SORT_TYPE *arr, size_t Len)
{
    int side;
    SORT_TYPE *scratch;

    for (side = 1; side * side < Len; side *= 2) {
        /* empty */
    }

    scratch = SORT_NEW_BUFFER(side);

    if (scratch != NULL) {
        GRAIL_COMMON_SORT(arr, (int)Len, scratch, side);
        SORT_DELETE_BUFFER(scratch);
        return;
    }

    GRAIL_SORT_FIXED_BUFFER(arr, Len);
}

/****** classic MergeInPlace *************/

/*
  Recursive in-place merge of A[0..L1-1] with A[L1..L1+L2-1].
  A pivot element is taken from the middle of the longer part, both parts are split
  around the range equal to it, the inner pieces are rotated into place and the two
  outer sub-problems are merged recursively. Parts shorter than 3 elements are
  handed to GRAIL_MERGE_WITHOUT_BUFFER.
*/
static void GRAIL_REC_MERGE(SORT_TYPE *A, int L1, int L2)
{
    int lo1, hi1, lo2, hi2;
    SORT_TYPE *pivot;

    if (L1 < 3 || L2 < 3) {
        GRAIL_MERGE_WITHOUT_BUFFER(A, L1, L2);
        return;
    }

    pivot = A + ((L1 < L2) ? (L1 + L2 / 2) : (L1 / 2));

    /* [lo1, hi1) - elements of the first part equal to the pivot */
    lo1 = GRAIL_BIN_SEARCH_LEFT(A, L1, pivot);
    hi1 = lo1;

    if (hi1 < L1 && SORT_CMP_A(A + hi1, pivot) == 0) {
        hi1 = GRAIL_BIN_SEARCH_RIGHT(A + lo1, L1 - lo1, pivot) + lo1;
    }

    /* [lo2, hi2) - elements of the second part equal to the pivot */
    lo2 = GRAIL_BIN_SEARCH_LEFT(A + L1, L2, pivot);
    hi2 = lo2;

    if (hi2 < L2 && SORT_CMP_A(A + L1 + hi2, pivot) == 0) {
        hi2 = GRAIL_BIN_SEARCH_RIGHT(A + L1 + lo2, L2 - lo2, pivot) + lo2;
    }

    if (lo1 != hi1) {
        GRAIL_ROTATE(A + lo1, L1 - lo1, lo2);

        if (hi2 != lo2) {
            GRAIL_ROTATE(A + (hi1 + lo2), L1 - hi1, hi2 - lo2);
        }
    } else {
        /* the first part holds nothing equal to the pivot: one rotation is enough */
        GRAIL_ROTATE(A + hi1, L1 - hi1, hi2);
    }

    GRAIL_REC_MERGE(A + (hi1 + hi2), L1 - hi1, L2 - hi2);
    GRAIL_REC_MERGE(A, lo1, lo2);
}

/* Bottom-up merge sort of arr[0..L-1] that merges with GRAIL_REC_MERGE. */
void REC_STABLE_SORT(SORT_TYPE *arr, size_t L)
{
    int i, run, pos, last_full, tail;

    /* order adjacent pairs first */
    for (i = 1; i < L; i += 2) {
        if (SORT_CMP_A(arr + i - 1, arr + i) > 0) {
            GRAIL_SWAP1(arr + (i - 1), arr + i);
        }
    }

    /* then merge runs of doubling length */
    for (run = 2; run < L; run *= 2) {
        last_full = (int)(L - 2 * run);

        for (pos = 0; pos <= last_full; pos += 2 * run) {
            GRAIL_REC_MERGE(arr + pos, run, run);
        }

        /* a trailing full run followed by a shorter one still needs merging */
        tail = (int)(L - pos);

        if (tail > run) {
            GRAIL_REC_MERGE(arr + pos, run, tail - run);
        }
    }
}

/* Bubble sort implementation based on Wikipedia article
   https://en.wikipedia.org/wiki/Bubble_sort

   After every pass only the prefix up to the position of the last swap is
   scanned again; the sort ends when a pass performs no swap.
*/
void BUBBLE_SORT(SORT_TYPE *dst, const size_t size)
{
    size_t unsorted, last_swap, k;

    for (unsorted = size; unsorted != 0U; unsorted = last_swap) {
        last_swap = 0U;

        for (k = 0U; k + 1U < unsorted; ++k) {
            if (SORT_CMP(dst[k], dst[k + 1U]) > 0) {
                SORT_SWAP(dst[k], dst[k + 1U]);
                last_swap = k + 1U;
            }
        }
    }
}

/* Drop the per-instantiation macros (configuration, helper and function-name macros).
   NOTE(review): presumably this lets the header be included again with a different
   SORT_NAME / SORT_TYPE - confirm. A few names appear twice below; undefining a macro
   that is no longer defined is harmless. */
#undef SORT_SAFE_CPY
#undef SORT_TYPE_CPY
#undef SORT_TYPE_MOVE
#undef SORT_NEW_BUFFER
#undef SORT_DELETE_BUFFER
#undef QUICK_SORT
#undef MEDIAN
#undef SORT_CONCAT
#undef SORT_MAKE_STR1
#undef SORT_MAKE_STR
#undef SORT_NAME
#undef SORT_TYPE
#undef SORT_CMP
#undef TEMP_STORAGE_T
#undef TIM_SORT_RUN_T
#undef PUSH_NEXT
#undef SORT_SWAP
#undef SORT_CONCAT
#undef SORT_MAKE_STR1
#undef SORT_MAKE_STR
#undef BINARY_INSERTION_FIND
#undef BINARY_INSERTION_SORT_START
#undef BINARY_INSERTION_SORT
#undef REVERSE_ELEMENTS
#undef COUNT_RUN
#undef TIM_SORT
#undef TIM_SORT_RESIZE
#undef TIM_SORT_COLLAPSE
#undef TIM_SORT_RUN_T
#undef TEMP_STORAGE_T
#undef MERGE_SORT
#undef MERGE_SORT_RECURSIVE
#undef MERGE_SORT_IN_PLACE
#undef MERGE_SORT_IN_PLACE_RMERGE
#undef MERGE_SORT_IN_PLACE_BACKMERGE
#undef MERGE_SORT_IN_PLACE_FRONTMERGE
#undef MERGE_SORT_IN_PLACE_ASWAP
#undef GRAIL_SWAP1
#undef REC_STABLE_SORT
#undef GRAIL_REC_MERGE
#undef GRAIL_SORT_DYN_BUFFER
#undef GRAIL_SORT_FIXED_BUFFER
#undef GRAIL_COMMON_SORT
#undef GRAIL_SORT
#undef GRAIL_COMBINE_BLOCKS
#undef GRAIL_LAZY_STABLE_SORT
#undef GRAIL_MERGE_WITHOUT_BUFFER
#undef GRAIL_ROTATE
#undef GRAIL_BIN_SEARCH_LEFT
#undef GRAIL_BUILD_BLOCKS
#undef GRAIL_FIND_KEYS
#undef GRAIL_MERGE_BUFFERS_LEFT_WITH_X_BUF
#undef GRAIL_BIN_SEARCH_RIGHT
#undef GRAIL_MERGE_BUFFERS_LEFT
#undef GRAIL_SMART_MERGE_WITH_X_BUF
#undef GRAIL_MERGE_LEFT_WITH_X_BUF
#undef GRAIL_SMART_MERGE_WITHOUT_BUFFER
#undef GRAIL_SMART_MERGE_WITH_BUFFER
#undef GRAIL_MERGE_RIGHT
#undef GRAIL_MERGE_LEFT
#undef GRAIL_SWAP_N
#undef SQRT_SORT
#undef SQRT_SORT_BUILD_BLOCKS
#undef SQRT_SORT_MERGE_BUFFERS_LEFT_WITH_X_BUF
#undef SQRT_SORT_MERGE_DOWN
#undef SQRT_SORT_MERGE_LEFT_WITH_X_BUF
#undef SQRT_SORT_MERGE_RIGHT
#undef SQRT_SORT_SWAP_N
#undef SQRT_SORT_SWAP_1
#undef SQRT_SORT_SMART_MERGE_WITH_X_BUF
#undef SQRT_SORT_SORT_INS
#undef SQRT_SORT_COMBINE_BLOCKS
#undef SQRT_SORT_COMMON_SORT
#undef SORT_CMP_A
#undef BUBBLE_SORT