@inproceedings{sitchinava-ipdps20,
  author = {Kyle Berney and Nodari Sitchinava},
  title  = {Engineering worst-case inputs for pairwise merge sort on {GPUs}},
  booktitle = {Proceedings of the 34th IEEE International Parallel and Distributed Processing Symposium},
  series = {IPDPS '20},
  year = {2020},
  month = {5},
  pages = {1133-1142},
  doi = {10.1109/IPDPS47924.2020.00119},
  abstract = "Currently, the fastest comparison-based sorting implementation on GPUs is implemented using a parallel pairwise merge sort algorithm (Thrust library).  To achieve fast runtimes, the number of threads $t$ to sort the input of $N$ elements is fine-tuned experimentally for each generation of Nvidia GPUs in such a way that the number of elements $E = N/t$ that each thread accesses in each merging round results in a small (empirically measured) number of shared memory contentions, known as \emph{bank conflicts}, while balancing the number of global memory accesses and latency-hiding through thread oversubscription/occupancy.

In this paper, we show that for every choice of $E < w$, such that $E$ and $w$ are co-prime, there exists an input permutation on which every warp of $w$ threads of the Thrust merge sort is effectively reduced to using at most $\lceil w/E \rceil$ threads due to sequentialization of shared memory accesses due to bank conflicts. Note that this matches the trivial worst-case bound on the loss of parallelism due to memory contentions for any warp accessing $wE$ contiguous shared memory locations. 

Our proof is constructive, i.e., we are able to automatically construct such permutation for every value of $E$.
We also show in practice that such constructed inputs result in up to \texttildelow 50\% slowdown, compared to the performance on random inputs, on modern GPU hardware.",
}