The AVX-512 version is clearly more efficient than the AVX2 version. Even though the i9-13900K has 32 threads while the i9-11900K has only 16, the AVX-512 version still runs at more than twice the speed of std::sort, whereas the AVX2 version, with half the register width and only 8-way merging, loses a lot of efficiency and falls far behind std::sort. There seems to be some room for improvement there, but I have not pursued it.
Besides SIMD, there is one more option: the GPU, via CUDA.
All I have is a 3060 Ti; really good cards are too expensive, so those are out of reach for now.
The goal here is to implement this algorithm with the CUDA toolchain, spreading the parallel sort across the card's 1536 parallel threads.
The principle is the same: split the data into small segments and sort them in parallel. The AVX-512 version first sorts runs of 16 32-bit numbers in parallel, then merges 16-way using OpenMP, recursing upward until the whole array is merged. A recursive implementation is a poor fit for CUDA, though, because CUDA's parallelism is on a different scale: a CPU with 16 threads can only merge 16 blocks at a time, while a 3060 Ti effectively has 1536 threads, so far more blocks can be sorted and merged simultaneously. Instead of the earlier recursive approach, we first partition the whole array, say into 512-element blocks, sort all of those partitions in parallel, and then merge them 16 blocks at a time. Without recursion we must iterate, and the partition configuration for each iteration level has to be computed up front by a function. After each level, one thread synchronization guarantees that all merges of that level have finished before the next level begins. It is rather like a breadth-first traversal of a multi-way tree.
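The level-by-level schedule can be sketched on the CPU with toy parameters (this is only an illustration of the scheduling idea, not the actual GPU code: it uses hypothetical 4-element base blocks and 2-way merges instead of the real 256-element blocks and 16-way merges):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Iterative (level-by-level) bottom-up merge sort, a CPU model of the GPU
// schedule: sort fixed-size base blocks first, then merge groups of blocks
// one level at a time.
static void levelwise_sort(std::vector<int>& v, size_t block = 4) {
    size_t n = v.size();
    // Level 0: sort every base block in place (on the GPU these all run
    // in parallel).
    for (size_t i = 0; i < n; i += block)
        std::sort(v.begin() + i, v.begin() + std::min(i + block, n));
    // Higher levels: merge adjacent runs; one level must finish completely
    // before the next starts -- the role the per-level synchronization
    // plays on the GPU.
    for (size_t width = block; width < n; width *= 2)
        for (size_t i = 0; i + width < n; i += 2 * width)
            std::inplace_merge(v.begin() + i, v.begin() + i + width,
                               v.begin() + std::min(i + 2 * width, n));
}
```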
In the concrete implementation, the 256-element base blocks are sorted with plain insertion sort. The data volume is small, so it sorts quickly.
__device__ static bool sieve_sort_256(uint32_t* a/*[256]*/, size_t n, uint32_t* result) {
    // Plain insertion sort; n is at most 256, so this is fast enough.
    for (size_t i = 1; i < n; i++) {
        uint32_t key = a[i];
        size_t j = i;
        // Shift larger elements right. Counting j down from i (instead of
        // i - 1) avoids the unsigned wrap-around that a `while (j >= 0)`
        // test (always true for size_t) would otherwise have to special-case.
        while (j > 0 && a[j - 1] > key) {
            a[j] = a[j - 1];
            j--;
        }
        a[j] = key;
    }
    for (size_t i = 0; i < n; i++) result[i] = a[i];
    return true;
}
struct partition {
    uint32_t* a;
    uint32_t* result;
    size_t n;
    size_t loops;
    size_t stride;
    size_t reminder;
    partition(uint32_t* a, uint32_t* result, size_t n, size_t loops, size_t stride, size_t reminder)
        : a(a), result(result), n(n), loops(loops), stride(stride), reminder(reminder) {
    }
};
static void make_partitions(uint32_t* a, uint32_t* result, size_t n, int depth, std::map<int, std::vector<partition>>& partitions, int min_bits = 8, int shift = 4) {
    size_t loops = 0, stride = 0, reminder = 0;
    if (get_config(n, loops, stride, reminder, min_bits, shift)) {
        auto f = partitions.find(depth);
        if (f == partitions.end()) {
            partitions.insert({ depth, { partition(a, result, n, loops, stride, reminder) } });
        }
        else {
            f->second.push_back(partition(a, result, n, loops, stride, reminder));
        }
        for (size_t i = 0; i < loops; i++) {
            make_partitions(a + i * stride, result + i * stride,
                (i == loops - 1 && reminder > 0) ? reminder : stride, depth + 1, partitions, min_bits, shift);
        }
    }
    else {
        auto f = partitions.find(depth);
        if (f == partitions.end()) {
            partitions.insert({ depth, { partition(a, result, n, 1, n, 0) } });
        }
        else {
            f->second.push_back(partition(a, result, n, 1, n, 0));
        }
    }
}
The partition struct and the partitioning function are shown above. The partitioning function itself is recursive, but its results are stored in a map keyed by level number. Each level holds a number of partitions; in general, the larger the level number, the lower the level. At the lowest level every partition holds at most 256 numbers, and the level above it groups them 16 per partition. Thus each step up the hierarchy has 1/16 the partition count of the level below it.
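A simplified model of the resulting level sizes (an assumption on my part: since get_config is not shown in this section, the sketch below just assumes an even 16-way split per level, 256-element base blocks, and a hypothetical helper name `level_sizes`):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Count how many partitions each level holds for a given n, assuming base
// blocks of at most 256 elements and a 16-way fan-out (shift = 4). Level
// numbers grow downward: level 0 covers the whole array; the deepest level
// holds the <=256-element base blocks.
static std::vector<size_t> level_sizes(size_t n, size_t block = 256, size_t fanout = 16) {
    std::vector<size_t> sizes;
    size_t count = 1;   // one partition spanning the whole array
    size_t span = n;    // elements covered by each partition on this level
    while (true) {
        sizes.push_back(count);
        if (span <= block) break;          // reached the base blocks
        count *= fanout;                   // each partition splits 16 ways
        span = (span + fanout - 1) / fanout;
    }
    return sizes;
}
```

For example, 65536 elements give three levels of 1, 16, and 256 partitions: each level has 16x as many partitions as the one above it.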
With the per-level partition configuration in hand, the kernel can be invoked once per level.
__global__ static void sieve_sort_kerenl_with_config(partition* partitions, int max_depth, int depth) {
    unsigned int index = blockDim.x * blockIdx.x + threadIdx.x;
    partition* part = partitions + index;
    if (part->n <= 256) {
        sieve_sort_256(part->a, part->n, part->result);
    }
    else {
        uint32_t* destination = part->a;
        uint32_t* source = part->result;
        // Ping-pong buffers: whether this level reads from `result` and
        // writes into `a`, or vice versa, depends on the parity of both
        // the distance to the deepest level and the total depth.
        int delta_depth = max_depth - depth;
        bool flip = ((delta_depth & 1) == 1);
        flip = ((max_depth & 1) == 1) ? !flip : flip;
        if (flip) {
            uint32_t* p = source;
            source = destination;
            destination = p;
        }
        sieve_collect(part->n, part->loops, part->stride, part->reminder, source, destination);
    }
}
Note how source and destination flip between levels; in particular, the parity of the total number of levels also affects the flip direction.
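The two chained parity tests actually collapse to a single check on the level number: flip is true exactly when depth is odd. A small host-side check of this equivalence (the function names `flip_for_level` and `flip_simplified` are mine; the first reproduces the kernel's arithmetic verbatim):

```cpp
#include <cassert>

// The kernel's source/destination flip, reproduced verbatim on the host.
static bool flip_for_level(int max_depth, int depth) {
    int delta_depth = max_depth - depth;
    bool flip = ((delta_depth & 1) == 1);
    flip = ((max_depth & 1) == 1) ? !flip : flip;
    return flip;
}

// (max_depth - depth) odd XOR max_depth odd  ==  depth odd, so the whole
// computation reduces to a parity test on depth alone.
static bool flip_simplified(int depth) {
    return (depth & 1) == 1;
}
```

Since flip is always false at depth 0, the final merge writes into `a` (the device-side `input` buffer), which is consistent with the host code copying `input`, not `result`, back to the caller.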
Finally, the kernel is invoked level by level, and each level must be fully synchronized before the next to guarantee correct data. Since this involves a round-trip to the CPU, it may reduce overall speed.
__host__ bool sieve_sort_cuda(uint32_t* a, size_t n)
{
    //max(n)==256P (2^60)
    if (a == nullptr)
        return false;
    else if (n <= 1)
        return true;
    else if (n == 2) {
        uint32_t a0 = *(a + 0);
        uint32_t a1 = *(a + 1);
        *(a + 0) = (a0 <= a1) ? a0 : a1;
        *(a + 1) = (a0 >= a1) ? a0 : a1;
        return true;
    }
    else {
        std::map<int, std::vector<partition>> _partitions;
        uint32_t* input = nullptr;
        uint32_t* result = nullptr;
        cudaError_t cudaStatus;
        // Choose which GPU to run on; change this on a multi-GPU system.
        cudaStatus = cudaSetDevice(0);
        cudaStatus = cudaMalloc((void**)&input, n * sizeof(uint32_t));
        cudaStatus = cudaMalloc((void**)&result, n * sizeof(uint32_t));
        if (result != nullptr && input != nullptr) {
            partition* partitions = nullptr;
            cudaStatus = cudaMemcpy(input, a, n * sizeof(uint32_t), cudaMemcpyHostToDevice);
            // Seed result with a copy of input, so the <=256-element base
            // blocks need no extra whole-block copy after sorting.
            cudaStatus = cudaMemcpy(result, input, n * sizeof(uint32_t), cudaMemcpyDeviceToDevice);
            make_partitions(input, result, n, 0, _partitions, 8, 4);
            int max_depth = (int)_partitions.size() - 1;
            size_t max_list_size = 0;
            for (auto& level : _partitions) {
                size_t s = level.second.size();
                max_list_size = s > max_list_size ? s : max_list_size;
            }
            // Round the buffer up to a whole number of thread blocks so the
            // surplus threads of a rounded-up launch index into it safely.
            size_t padded_size = (max_list_size + THREAD_NUM - 1) / THREAD_NUM * THREAD_NUM;
            cudaStatus = cudaMalloc((void**)&partitions, padded_size * sizeof(partition));
            for (int i = max_depth; i >= 0; i--) {
                std::vector<partition>& partitions_list = _partitions[i];
                size_t list_size = partitions_list.size();
                if (list_size > 0) {
                    // Zero-fill before each level: surplus threads then see
                    // entries with n == 0 and do nothing, instead of reading
                    // stale partitions from a previous (larger) level.
                    cudaStatus = cudaMemset(partitions, 0, padded_size * sizeof(partition));
                    cudaStatus = cudaMemcpy(partitions, (void*)partitions_list.data(), list_size * sizeof(partition), cudaMemcpyHostToDevice);
                    if (list_size <= THREAD_NUM) {
                        sieve_sort_kerenl_with_config << <1, (unsigned int)list_size >> > (partitions, max_depth, i);
                    }
                    else {
                        dim3 grid((unsigned int)(padded_size / THREAD_NUM), 1, 1);
                        dim3 block(THREAD_NUM, 1, 1);
                        sieve_sort_kerenl_with_config << <grid, block >> > (partitions, max_depth, i);
                    }
                    // Each level must fully finish before the next begins.
                    // (cudaThreadSynchronize() is deprecated.)
                    cudaDeviceSynchronize();
                }
            }
            cudaFree(partitions);
            // The flip parity guarantees the final merge lands in `input`.
            cudaStatus = cudaMemcpy(a, input, n * sizeof(uint32_t), cudaMemcpyDeviceToHost);
        }
        cudaFree(result);
        cudaFree(input);
    }
    return true;
}
To speed up the individual 256-element blocks, `result` is seeded with a wholesale copy of `a` when the input is uploaded to the GPU. That way a 256-element block needs no separate full copy into `result` once it is sorted; the writes to `a` and `result` during sorting are already handled inside the sort function.
The test results are shown below. Compared with std::sort, the CUDA version is simply far too slow.
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
==================================
All runs use repeats = 1; samples = 2^i for i = 8 to 27.

samples      t1(sieve) s   t2(std::) s   sieve K/s      std K/s        ratio(std/sieve)
256          0.247064      6.7e-06       0.00404754     149.254        0.00271185%
512          0.0025327     1.42e-05      0.394836       70.4225        0.560666%
1024         0.003554      2.82e-05      0.281373       35.461         0.793472%
2048         0.0057337     6e-05         0.174407       16.6667        1.04644%
4096         0.0139809     0.0001294     0.0715262      7.72798        0.925548%
8192         0.0200408     0.0002744     0.0498982      3.64431        1.36921%
16384        0.0292157     0.0006587     0.0342282      1.51814        2.25461%
32768        0.0602454     0.0015211     0.0165988      0.657419       2.52484%
65536        0.174044      0.0034926     0.00574567     0.28632        2.00673%
131072       0.225395      0.0058118     0.00443665     0.172064       2.57849%
262144       0.347995      0.0123578     0.0028736      0.0809206      3.55114%
524288       0.794118      0.026406      0.00125926     0.0378702      3.3252%
1048576      2.44242       0.0549981     0.00040943     0.0181824      2.25179%
2097152      3.49418       0.119685      0.00028619     0.00835527     3.42527%
4194304      5.53793       0.248387      0.000180573    0.00402598     4.4852%
8388608      13.1582       0.686026      7.59983e-05    0.00145767     5.21368%
16777216     40.1063       1.53261       2.49338e-05    0.000652481    3.82138%
33554432     58.7348       2.91094       1.70257e-05    0.000343532    4.95606%
67108864     92.1582       6.27641       1.08509e-05    0.000159327    6.81047%
134217728    213.021       13.2613       4.69437e-06    7.54075e-05    6.22533%
I hope to find a way to improve it.