@inproceedings{sitchinava-ipdps20, author = {Kyle Berney and Nodari Sitchinava}, title = {Engineering worst-case inputs for pairwise merge sort on {GPUs}}, booktitle = {Proceedings of the 34th IEEE International Parallel and Distributed Processing Symposium}, series = {IPDPS '20}, year = {2020}, month = {5}, pages = {1133-1142}, doi = {10.1109/IPDPS47924.2020.00119}, abstract = "Currently, the fastest comparison-based sorting implementation on GPUs is implemented using a parallel pairwise merge sort algorithm (Thrust library). To achieve fast runtimes, the number of threads $t$ to sort the input of $N$ elements is fine-tuned experimentally for each generation of Nvidia GPUs in such a way that the number of elements $E = N/t$ that each thread accesses in each merging round results in a small (empirically measured) number of shared memory contentions, known as \emph{bank conflicts}, while balancing the number of global memory accesses and latency-hiding through thread oversubscription/occupancy. In this paper, we show that for every choice of $E < w$, such that $E$ and $w$ are co-prime, there exists an input permutation on which every warp of $w$ threads of the Thrust merge sort is effectively reduced to using at most $\lceil w/E \rceil$ threads due to sequentialization of shared memory accesses due to bank conflicts. Note that this matches the trivial worst-case bound on the loss of parallelism due to memory contentions for any warp accessing $wE$ contiguous shared memory locations. Our proof is constructive, i.e., we are able to automatically construct such permutation for every value of $E$. We also show in practice that such constructed inputs result in up to \texttildelow 50\% slowdown, compared to the performance on random inputs, on modern GPU hardware.", }