@inproceedings{ross2002spreadsort,
  acmid = {691537},
  author = {Ross, Steven J.},
  booktitle = {Proceedings of the International Conference on Parallel and Distributed Processing Techniques and Applications - Volume 3},
  interhash = {8d6f28a42466de9d2ea4cc9c11884d49},
  intrahash = {73dffc3dfca4ab655c7eb58521b97c52},
  isbn = {1-892512-89-0},
  numpages = {7},
  pages = {1100--1106},
  publisher = {CSREA Press},
  series = {PDPTA '02},
  title = {The Spreadsort High-performance General-case Sorting Algorithm},
  url = {http://dl.acm.org/citation.cfm?id=646441.691537},
  year = 2002
}

@article{Sintorn20081381,
  abstract = {This paper presents an algorithm for fast sorting of large lists using modern GPUs. The method achieves high speed by efficiently utilizing the parallelism of the \{GPU\} throughout the whole algorithm. Initially, GPU-based bucketsort or quicksort splits the list into enough sublists then to be sorted in parallel using merge-sort. The algorithm is of complexity n log n , and for lists of 8 M elements and using a single Geforce 8800 GTS-512, it is 2.5 times as fast as the bitonic sort algorithms, with standard complexity of n ( log n ) 2 , which for a long time was considered to be the fastest for \{GPU\} sorting. It is 6 times faster than single \{CPU\} quicksort, and 10% faster than the recent GPU-based radix sort. Finally, the algorithm is further parallelized to utilize two graphics cards, resulting in yet another 1.8 times speedup. },
  author = {Sintorn, Erik and Assarsson, Ulf},
  doi = {10.1016/j.jpdc.2008.05.012},
  interhash = {db7c140766f1848f9a6f74f693d810d8},
  intrahash = {0b72bee3ea941e13dec3f706c0f4362b},
  issn = {0743-7315},
  journal = {Journal of Parallel and Distributed Computing },
  note = {General-Purpose Processing using Graphics Processing Units },
  number = 10,
  pages = {1381 - 1388},
  title = {Fast parallel GPU-sorting using a hybrid algorithm },
  url = {http://www.sciencedirect.com/science/article/pii/S0743731508001196},
  volume = 68,
  year = 2008
}

@article{bottger2013vergleich,
  abstract = {Gegenstand dieser Arbeit ist der Vergleich der Sortierverfahren Radixsort, Mergesort und Quicksort. Alle drei Algorithmen werden zunächst ausführlich vorgestellt und ihre Funktionsweise erläutert. Dabei wird die Komplexität und das resultierende Laufzeitverhalten theoretisch betrachtet und die daraus entstehenden Vor- und Nachteile diskutiert. In einem Versuch werden zudem alle drei Algorithmen unter realen Bedingungen auf einem Computer getestet, um die theoretischen Betrachtungen zu untermauern.},
  author = {Böttger, Sebastian},
  interhash = {d4ac1b4f52d6f52958942c14957d6602},
  intrahash = {c49c9ef4dc754980aa36760133790f69},
  school = {University of Kassel},
  title = {Vergleich von Sortierverfahren},
  year = 2013
}

@article{knuth1970neumanns,
  acmid = {356581},
  address = {New York, NY, USA},
  author = {Knuth, Donald E.},
  doi = {10.1145/356580.356581},
  interhash = {fa0c2abe03cdd16d15a124e8f000a0ad},
  intrahash = {6272bbad8db11ff6524cff4572ffca31},
  issn = {0360-0300},
  issue_date = {Dec. 1970},
  journal = {ACM Comput. Surv.},
  month = dec,
  number = 4,
  numpages = {14},
  pages = {247--260},
  publisher = {ACM},
  title = {Von Neumann's First Computer Program},
  url = {http://doi.acm.org/10.1145/356580.356581},
  volume = 2,
  year = 1970
}

@article{sedgewick1978implementing,
  abstract = {This paper is a practical study of how to implement the Quicksort sorting algorithm and its best variants on real computers, including how to apply various code optimization techniques. A detailed implementation combining the most effective improvements to Quicksort is given, along with a discussion of how to implement it in assembly language. Analytic results describing the performance of the programs are summarized. A variety of special situations are considered from a practical standpoint to illustrate Quicksort's wide applicability as an internal sorting method which requires negligible extra storage.},
  acmid = {359631},
  address = {New York, NY, USA},
  author = {Sedgewick, Robert},
  doi = {10.1145/359619.359631},
  interhash = {9a4a22f5ef011caa82c52462824cf636},
  intrahash = {afad926399892172416b18c7a996f647},
  issn = {0001-0782},
  issue_date = {Oct. 1978},
  journal = {Commun. ACM},
  month = oct,
  number = 10,
  numpages = {11},
  pages = {847--857},
  publisher = {ACM},
  title = {Implementing Quicksort programs},
  url = {http://doi.acm.org/10.1145/359619.359631},
  volume = 21,
  year = 1978
}

@inproceedings{govindaraju2006gputerasort,
  abstract = {We present a novel external sorting algorithm using graphics processors (GPUs) on large databases composed of billions of records and wide keys. Our algorithm uses the data parallelism within a GPU along with task parallelism by scheduling some of the memory-intensive and compute-intensive threads on the GPU. Our new sorting architecture provides multiple memory interfaces on the same PC -- a fast and dedicated memory interface on the GPU along with the main memory interface for CPU computations. As a result, we achieve higher memory bandwidth as compared to CPU-based algorithms running on commodity PCs. Our approach takes into account the limited communication bandwidth between the CPU and the GPU, and reduces the data communication between the two processors. Our algorithm also improves the performance of disk transfers and achieves close to peak I/O performance. We have tested the performance of our algorithm on the SortBenchmark and applied it to large databases composed of a few hundred Gigabytes of data. Our results on a 3 GHz Pentium IV PC with $300 NVIDIA 7800 GT GPU indicate a significant performance improvement over optimized CPU-based algorithms on high-end PCs with 3.6 GHz Dual Xeon processors. Our implementation is able to outperform the current high-end PennySort benchmark and results in a higher performance to price ratio. Overall, our results indicate that using a GPU as a co-processor can significantly improve the performance of sorting algorithms on large databases.},
  acmid = {1142511},
  address = {New York, NY, USA},
  author = {Govindaraju, Naga and Gray, Jim and Kumar, Ritesh and Manocha, Dinesh},
  booktitle = {Proceedings of the 2006 ACM SIGMOD international conference on Management of data},
  doi = {10.1145/1142473.1142511},
  interhash = {a197d44fffee74b95d28db938cec0327},
  intrahash = {f586cfc6ac72dc364dbbeeff585ca05c},
  isbn = {1-59593-434-0},
  location = {Chicago, IL, USA},
  numpages = {12},
  pages = {325--336},
  publisher = {ACM},
  series = {SIGMOD '06},
  title = {GPUTeraSort: high performance graphics co-processor sorting for large database management},
  url = {http://doi.acm.org/10.1145/1142473.1142511},
  year = 2006
}

@article{friend1956sorting,
  acmid = {320833},
  address = {New York, NY, USA},
  author = {Friend, Edward H.},
  doi = {10.1145/320831.320833},
  interhash = {4ae23205cbe0eedcfea94aa157ec2a83},
  intrahash = {77f952657e1bcc2e08ec71ad9677a4ec},
  issn = {0004-5411},
  issue_date = {July 1956},
  journal = {J. ACM},
  month = jul,
  number = 3,
  numpages = {35},
  pages = {134--168},
  publisher = {ACM},
  title = {Sorting on Electronic Computer Systems},
  url = {http://doi.acm.org/10.1145/320831.320833},
  volume = 3,
  year = 1956
}