筛子排序(SieveSort)- 8

avx512版本的效率显然高于avx2版本,哪怕是i9-13900K具有32线程,而i9-11900K只有16线程,avx512版本的速度也能达到std::sort的两倍还多,而avx2版本,位宽不够,而且也只能8路归并,所以效率下降了很多,速度远不及std::sort。似乎也有一定的改进空间,只是我没有特别尝试。

除了SIMD,其实我们还有一个选项,就是显卡,上CUDA。

我手上只有3060Ti,太好的显卡太贵,目前没法奢望。

要把这个算法用CUDA这一套东西来写,把并行排序实现到1536个CUDA核心(CUDA core)上面,就是这里要实现的。

原则是一样的,就是分小段并行排序。avx512版本首先是16个32位数并行排序,然后是16个线程基于omp进行归并,然后以递归的方式一直向上归并到全部完成。而CUDA版本的递归实现有很大的问题。毕竟CUDA本来就具有极大的并行能力。比如CPU只有16线程,同时归并的就只能有16个块,而3060Ti相当于有1536个线程,同时排序并同时归并的能力就大得多。所以我们不用先前的递归方法,而是首先对整个数据空间进行分区,比如512一个块的进行分区,然后并行排序所有这些分区,然后按照16块一个分区进行归并。不使用递归,那就只能用迭代。而每一层迭代的分区配置要先用函数计算好。然后每一个层次进行一次线程同步,以保证这一层的归并都完成了之后,才能进行下一层的归并。这有点像是对多叉树的广度遍历的做法。

具体实现中,256单位块的排序用的是最基本的插入排序。这个数据量比较小,于是排起来也很快。

// Device-side insertion sort for one leaf block (n <= 256 elements).
// Sorts a[0..n) in place, then copies the sorted data into result so both
// buffers hold the same sorted content (the merge levels above ping-pong
// between the two buffers and may read either one).
// Returns true unconditionally (kept for interface compatibility).
//
// Fix: the original compared an unsigned index with `j >= 0` (always true)
// and relied on detecting wraparound via `j == ~0ULL`. Restructured as the
// canonical shift-then-place insertion sort, which never decrements below 0.
__device__ static bool sieve_sort_256(uint32_t* a/*[256]*/, size_t n, uint32_t* result) {
	for (size_t i = 1; i < n; i++) {
		uint32_t key = a[i];
		size_t j = i;
		// Shift larger elements one slot to the right.
		while (j > 0 && a[j - 1] > key) {
			a[j] = a[j - 1];
			j--;
		}
		a[j] = key;
	}

	// Mirror the sorted block into the companion buffer.
	for (size_t i = 0; i < n; i++) result[i] = a[i];
	return true;
}

// One node of the merge tree: describes a span of the data buffers and how
// it is subdivided for merging at its level.
struct partition {
	uint32_t* a;       // source/primary buffer for this span
	uint32_t* result;  // companion buffer (merge ping-pong target)
	size_t n;          // total number of elements in the span
	size_t loops;      // how many child sub-blocks this node merges
	size_t stride;     // element count of each full child sub-block
	size_t reminder;   // size of the shorter trailing child, or 0 if none
	                   // NOTE(review): spelled "reminder" throughout the
	                   // project; kept for interface compatibility.
	partition(uint32_t* a, uint32_t* result, size_t n, size_t loops, size_t stride, size_t reminder)
		: a(a)
		, result(result)
		, n(n)
		, loops(loops)
		, stride(stride)
		, reminder(reminder)
	{
	}
};


// Recursively builds the per-depth partition table used by the iterative GPU
// merge: partitions[d] lists every merge/sort node at depth d. Depth grows
// downward — the largest depth holds the leaf blocks (sorted directly), and
// depth 0 is the single root merge covering the whole array.
//
// get_config decides whether a span of n elements is still splittable given
// min_bits/shift; when it is, it yields the loop count, child stride, and
// trailing-remainder size for this node.
static void make_partitions(uint32_t* a, uint32_t* result, size_t n, int depth, std::map<int, std::vector<partition>>& partitions, int min_bits = 8, int shift = 4) {
	size_t loops = 0, stride = 0, reminder = 0;
	if (get_config(n, loops, stride, reminder, min_bits, shift))
	{
		// operator[] default-constructs the vector on first access,
		// replacing the manual find/insert duplication.
		partitions[depth].emplace_back(a, result, n, loops, stride, reminder);
		for (size_t i = 0; i < loops; i++) {
			// The last child gets the remainder when the split is uneven.
			size_t child_n = (i == loops - 1 && reminder > 0) ? reminder : stride;
			make_partitions(a + i * stride, result + i * stride,
				child_n, depth + 1, partitions, min_bits, shift);
		}
	}
	else {
		// Leaf node: small enough to sort directly (loops=1, stride=n).
		partitions[depth].emplace_back(a, result, n, 1, n, 0);
	}
}

分区的结构体以及分区函数如上。可见分区函数本身是递归的,但结果存储于以层次为序号的map之中。每个层次有若干分区,一般来说,层次号越大的层次越低,最低一层每一个都是小于或者等于256个数的分区,再上一层则是每16个一组的分区。可见这些分区层次每更上一层,就是下面层次分区数量的1/16。

获得了分区层次的配置之后,就可以调用核函数在每个层次上分别处理了。

// One thread per partition entry at the given depth: leaves (n <= 256) are
// sorted directly; interior nodes merge their children's output.
// NOTE(review): `index` is not bounds-checked against the number of
// partitions at this depth. The host launcher rounds the grid up to whole
// blocks when list_size > THREAD_NUM, so trailing threads dereference
// partitions[index] past the copied entries — the launcher must guarantee
// those slots are safe (e.g. zeroed, so n == 0 makes them no-ops).
__global__ static void sieve_sort_kerenl_with_config(partition* partitions, int max_depth, int depth) {
	// Flat global thread id selects this thread's partition entry.
	unsigned int index =
		blockDim.x * blockIdx.x + threadIdx.x;

	partition* part = partitions + index;
	//printf("n=%lld,index=%d\n", pc->n, index);
	if (part->n <= 256) {
		// Leaf: insertion-sort the block; writes both a and result.
		sieve_sort_256(part->a, part->n, part->result);
	}
	else {
		// Interior node: merge `loops` already-sorted child blocks.
		// The two buffers ping-pong between levels; which one is the source
		// at this level depends on how many levels lie below us
		// (delta_depth) XOR'd with the parity of the total depth, so the
		// final level's output lands in `a` for the host-side copy-back.
		uint32_t* destination = part->a;
		uint32_t* source = part->result;
		int delta_depth = max_depth - depth;
		bool flip = ((delta_depth & 1) == 1);
		flip = (((max_depth) & 1) == 1) ? !flip : flip;
		if (flip) {
			uint32_t* p = source;
			source = destination;
			destination = p;
		}
		sieve_collect(part->n, part->loops, part->stride, part->reminder, source, destination);
	}
}

这里要注意层次之间源和目的交换翻转的情况。尤其是总层次数量的奇偶性也影响翻转的方向。

最终,调用核函数,要注意的是,一层一层的调用,每一层都要完全同步,才能保证数据的正确。但由于涉及到和CPU的交互,可能会降低整体运行的速度。

// Host driver: sieve-sorts a[0..n) of uint32_t on the GPU.
// Copies the data to the device, builds the per-depth partition table on the
// host, then launches one kernel per depth from the deepest level (leaf
// sorts) up to depth 0 (root merge), synchronizing between levels so every
// merge at a depth completes before its parents run.
// Returns false when a is null or any CUDA call fails; true on success.
__host__ bool sieve_sort_cuda(uint32_t* a, size_t n)
{
	//max(n)==256P (2^60)
	if (a == nullptr)
		return false;
	else if (n <= 1)
		return true;
	else if (n == 2) {
		// Two elements: compare/swap on the host, no GPU round-trip needed.
		uint32_t a0 = *(a + 0);
		uint32_t a1 = *(a + 1);
		*(a + 0) = (a0 <= a1) ? a0 : a1;
		*(a + 1) = (a0 >= a1) ? a0 : a1;
		return true;
	}
	else {
		std::map<int, std::vector<partition>> _partitions;
		uint32_t* input = nullptr;
		uint32_t* result = nullptr;
		partition* partitions = nullptr;
		// Chain the status through every call instead of overwriting and
		// ignoring it, as the original did.
		cudaError_t st = cudaSetDevice(0);
		if (st == cudaSuccess)
			st = cudaMalloc((void**)&input, n * sizeof(uint32_t));
		if (st == cudaSuccess)
			st = cudaMalloc((void**)&result, n * sizeof(uint32_t));
		if (st == cudaSuccess)
			st = cudaMemcpy(input, a, n * sizeof(uint32_t), cudaMemcpyHostToDevice);
		if (st == cudaSuccess)
			// Seed result from input so leaf sorts leave both buffers
			// holding the sorted block without an extra device-wide copy.
			st = cudaMemcpy(result, input, n * sizeof(uint32_t), cudaMemcpyDeviceToDevice);
		if (st == cudaSuccess) {
			make_partitions(input, result, n, 0, _partitions, 8, 4);
			int max_depth = (int)_partitions.size() - 1;
			size_t max_list_size = 0;
			for (auto& level : _partitions) {
				size_t s = level.second.size();
				if (s > max_list_size) max_list_size = s;
			}
			// Allocate a whole number of thread blocks. The grid below is
			// rounded up with a ceil-div, so over-launched trailing threads
			// index into this padding instead of reading past the buffer.
			size_t padded = ((max_list_size + THREAD_NUM - 1) / THREAD_NUM) * (size_t)THREAD_NUM;
			st = cudaMalloc((void**)&partitions, padded * sizeof(partition));

			for (int i = max_depth; st == cudaSuccess && i >= 0; i--) {
				std::vector<partition>& partitions_list = _partitions[i];
				size_t list_size = partitions_list.size();
				if (list_size == 0) continue;
				st = cudaMemcpy(partitions, (void*)partitions_list.data(),
					list_size * sizeof(partition), cudaMemcpyHostToDevice);
				if (st != cudaSuccess) break;
				if (list_size <= THREAD_NUM) {
					// Exact launch: one block, one thread per partition —
					// no out-of-range threads possible.
					sieve_sort_kerenl_with_config << <1, (unsigned int)list_size >> > (partitions, max_depth, i);
				}
				else {
					// Rounded-up grid: zero the tail entries first so the
					// extra threads see n == 0 partitions (harmless no-ops
					// in the kernel) rather than stale entries left over
					// from the deeper, wider level.
					size_t blocks = (list_size + THREAD_NUM - 1) / THREAD_NUM;
					size_t launched = blocks * (size_t)THREAD_NUM;
					if (launched > list_size)
						st = cudaMemset(partitions + list_size, 0,
							(launched - list_size) * sizeof(partition));
					if (st != cudaSuccess) break;
					dim3 grid((unsigned int)blocks, 1, 1);
					dim3 block(THREAD_NUM, 1, 1);
					sieve_sort_kerenl_with_config << <grid, block >> > (partitions, max_depth, i);
				}
				// Catch launch-configuration errors, then barrier so this
				// whole level finishes before the next one starts.
				// (cudaThreadSynchronize is deprecated.)
				st = cudaGetLastError();
				if (st == cudaSuccess)
					st = cudaDeviceSynchronize();
			}

			if (st == cudaSuccess)
				// The kernel's flip parity guarantees the fully sorted data
				// ends up back in `input` after the depth-0 merge.
				st = cudaMemcpy(a, input, n * sizeof(uint32_t), cudaMemcpyDeviceToHost);
		}
		// cudaFree(nullptr) is a safe no-op, so unconditional cleanup is fine.
		cudaFree(partitions);
		cudaFree(result);
		cudaFree(input);
		return st == cudaSuccess;
	}
}

为了提高单个256-数据块的速度,输入数据到GPU的时候,result整体从a复制数据。这样的话单个的256-数据块在完成排序的时候就不需要全体复制到result中,而是排序过程中数据分别写到a和result已经在排序函数中实现了。
 

测试结果如下面的运行记录所示,只是相比较于std::sort,CUDA版本的表现太慢了。


i=8
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:256
repeats:1
sieve sort speed:0.00404754K/s
std sort speed:  149.254K/s
t1(seive):0.247064 s
t2(std::):6.7e-06 s
ratio:0.00271185%

i=9
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:512
repeats:1
sieve sort speed:0.394836K/s
std sort speed:  70.4225K/s
t1(seive):0.0025327 s
t2(std::):1.42e-05 s
ratio:0.560666%

i=10
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:1024
repeats:1
sieve sort speed:0.281373K/s
std sort speed:  35.461K/s
t1(seive):0.003554 s
t2(std::):2.82e-05 s
ratio:0.793472%

i=11
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:2048
repeats:1
sieve sort speed:0.174407K/s
std sort speed:  16.6667K/s
t1(seive):0.0057337 s
t2(std::):6e-05 s
ratio:1.04644%

i=12
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:4096
repeats:1
sieve sort speed:0.0715262K/s
std sort speed:  7.72798K/s
t1(seive):0.0139809 s
t2(std::):0.0001294 s
ratio:0.925548%

i=13
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:8192
repeats:1
sieve sort speed:0.0498982K/s
std sort speed:  3.64431K/s
t1(seive):0.0200408 s
t2(std::):0.0002744 s
ratio:1.36921%

i=14
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:16384
repeats:1
sieve sort speed:0.0342282K/s
std sort speed:  1.51814K/s
t1(seive):0.0292157 s
t2(std::):0.0006587 s
ratio:2.25461%

i=15
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:32768
repeats:1
sieve sort speed:0.0165988K/s
std sort speed:  0.657419K/s
t1(seive):0.0602454 s
t2(std::):0.0015211 s
ratio:2.52484%

i=16
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:65536
repeats:1
sieve sort speed:0.00574567K/s
std sort speed:  0.28632K/s
t1(seive):0.174044 s
t2(std::):0.0034926 s
ratio:2.00673%

i=17
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:131072
repeats:1
sieve sort speed:0.00443665K/s
std sort speed:  0.172064K/s
t1(seive):0.225395 s
t2(std::):0.0058118 s
ratio:2.57849%

i=18
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:262144
repeats:1
sieve sort speed:0.0028736K/s
std sort speed:  0.0809206K/s
t1(seive):0.347995 s
t2(std::):0.0123578 s
ratio:3.55114%

i=19
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:524288
repeats:1
sieve sort speed:0.00125926K/s
std sort speed:  0.0378702K/s
t1(seive):0.794118 s
t2(std::):0.026406 s
ratio:3.3252%

i=20
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:1048576
repeats:1
sieve sort speed:0.00040943K/s
std sort speed:  0.0181824K/s
t1(seive):2.44242 s
t2(std::):0.0549981 s
ratio:2.25179%

i=21
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:2097152
repeats:1
sieve sort speed:0.00028619K/s
std sort speed:  0.00835527K/s
t1(seive):3.49418 s
t2(std::):0.119685 s
ratio:3.42527%

i=22
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:4194304
repeats:1
sieve sort speed:0.000180573K/s
std sort speed:  0.00402598K/s
t1(seive):5.53793 s
t2(std::):0.248387 s
ratio:4.4852%

i=23
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:8388608
repeats:1
sieve sort speed:7.59983e-05K/s
std sort speed:  0.00145767K/s
t1(seive):13.1582 s
t2(std::):0.686026 s
ratio:5.21368%

i=24
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:16777216
repeats:1
sieve sort speed:2.49338e-05K/s
std sort speed:  0.000652481K/s
t1(seive):40.1063 s
t2(std::):1.53261 s
ratio:3.82138%

i=25
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:33554432
repeats:1
sieve sort speed:1.70257e-05K/s
std sort speed:  0.000343532K/s
t1(seive):58.7348 s
t2(std::):2.91094 s
ratio:4.95606%

i=26
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:67108864
repeats:1
sieve sort speed:1.08509e-05K/s
std sort speed:  0.000159327K/s
t1(seive):92.1582 s
t2(std::):6.27641 s
ratio:6.81047%

i=27
==================================
Device ID: 0 has 38 multiprocessors.
Each multiprocessor has 48 warps (with 32 threads each).
CUDA cores per multiprocessor: 1536
samples:134217728
repeats:1
sieve sort speed:4.69437e-06K/s
std sort speed:  7.54075e-05K/s
t1(seive):213.021 s
t2(std::):13.2613 s
ratio:6.22533%

希望能找到方法对其进行改进。

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:/a/895823.html

如若内容造成侵权/违法违规/事实不符,请联系我们进行投诉反馈qq邮箱809451989@qq.com,一经查实,立即删除!

相关文章

【python爬虫实战】爬取全年天气数据并做数据可视化分析!附源码

由于篇幅限制&#xff0c;无法展示完整代码&#xff0c;需要的朋友可在下方获取&#xff01;100%免费。 一、主题式网络爬虫设计方案 1. 主题式网络爬虫名称&#xff1a;天气预报爬取数据与可视化数据 2. 主题式网络爬虫爬取的内容与数据特征分析&#xff1a; - 爬取内容&am…

蜜罐技术的出现究竟影响了什么

自网络诞生以来&#xff0c;攻击威胁事件层出不穷&#xff0c;网络攻防对抗已成为信息时代背景下的无硝烟战争。然而&#xff0c;传统的网络防御技术如防火墙、入侵检测技术等都是一种敌暗我明的被动防御&#xff0c;难以有效应对攻击者随时随地发起的无处不在的攻击和威胁。蜜…

IO多路复用概述与epoll简介

一、引言 在网络编程中&#xff0c;高并发的场景下处理大量连接请求是一项挑战。传统的阻塞式IO模型会让线程在等待数据的过程中陷入停顿&#xff0c;导致系统效率低下。为了解决这个问题&#xff0c;IO多路复用应运而生。它允许一个线程同时监听多个文件描述符&#xff08;如…

Gin框架操作指南02:JSON渲染

官方文档地址&#xff08;中文&#xff09;&#xff1a;https://gin-gonic.com/zh-cn/docs/ 注&#xff1a;本教程采用工作区机制&#xff0c;所以一个项目下载了Gin框架&#xff0c;其余项目就无需重复下载&#xff0c;想了解的读者可阅读第一节&#xff1a;Gin操作指南&#…

qt creator 开发环境的安装

1.找官网 官网地址&#xff1a;Installation | Qt Creator Documentation 点 Parent Directory 继续点 Parent Directory 点 archive/ 2.下载在线安装器 点 online_ainstallers 选择在线安装器版本 选择对应版本后进入下载列表&#xff0c;根据自己的系统选择下载。 下载后…

DreamFace 4.7.1 | 图片说话,数字人

DreamFace是一款可以把静态图片变成动态视频的软件&#xff0c;操作简单&#xff0c;内置多种模板可供选择。此外&#xff0c;还支持将图片变得更清晰或者转换成卡通风格等功能&#xff0c;非常适合喜欢创意视频制作的用户。通过安装软件后&#xff0c;根据提示选择需要转换的静…

c++ pdf文件提取txt文本示例

最近抽空采用之前封装的接口将pdf文件提取出txt文本&#xff0c;顺利完成&#xff0c;界面如下所示&#xff1a; 提起的效果如下所示&#xff1a; 输出的txt文本内容如下&#xff1a; 下载链接&#xff1a;https://download.csdn.net/download/u011269801/89905548

vue中如何检测数组变化(vue基础,面试,源码级讲解)

大家有什么不明白的地方可以分享在评论区&#xff0c;大家一起探讨哦~~ &#xff08;如果对数据劫持还有所不明白的小伙伴&#xff0c;可以去看看上一篇文章哦&#xff09; 在vue2中&#xff0c;是如何对数组进行劫持的呢&#xff1f; 简单代码实现&#xff1a; 在vue2中&…

pytorh学习笔记——cifar10(三)模仿VGGNet创建卷积网络

VGG16是由牛津大学视觉几何组&#xff08;Visual Geometry Group&#xff09;提出的一种深度卷积神经网络模型。 VGGNet 探索了卷积神经网络的深度与其性能之间的关系&#xff0c;成功地构筑了 16~19 层深的卷积神经网络&#xff0c;同时拓展性又很强&#xff0c;迁移到其它图片…

反转链表 K个一组翻转链表

目录 LeetCode206 反转链表 LeetCode92 反转链表II LeetCode25 K个一组翻转链表 LeetCode206 反转链表 /*** Definition for singly-linked list.* struct ListNode {* int val;* ListNode *next;* ListNode() : val(0), next(nullptr) {}* ListNode(int x)…

poisson过程——随机模拟(Python和R实现)

Python实现 exponential()使用&#xff0c;自动poisson过程实现。 import numpy as np import matplotlib.pyplot as plt# Parameters lambda_rate 5 # rate parameter (events per time unit) T 10 # total time# Generate Poisson process times np.random.exponential(…

PCL 点云配准 Trimed-ICP算法(精配准

目录 一、概述 1.1原理 1.2实现步骤 1.3应用场景 二、代码实现 2.1关键函数 2.1.1 perform_standard_icp 函数 2.1.2 perform_trimmed_icp 函数 2.1.3 visualize_registration 函数 2.2完整代码 PCL点云算法汇总及实战案例汇总的目录地址链接&#xff1a; PCL点云算…

软件设计模式------简单工厂模式

简单工厂模式&#xff08;Simple factory Pattern&#xff09;&#xff0c;又称静态工厂方法(Static Factory Method),属于创新型模式&#xff0c;但它不属于GoF23个设计模式其一。 一、模式动机&#xff1a; 有时需要创建一些来自相同父类的类的实例。 二、定义&#xff1a…

(二十)、从宿主机访问 k8s(minikube) 发布的 redis 服务

文章目录 1、环境准备2、具体操作2.1、启动 minikube (start/stop)2.2、准备 redis-deployment.yaml2.3、执行 redis-deployment.yaml2.3.1、查看 pod 信息和日志 2.4、检查部署和服务状态2.4.1、如果需要删除 3、查看 IP 的几个命令3.1、查看IP的几个命令3.2、解读3.3、宿主机…

【C语言】数据输出格式控制

数据的输出格式修饰 常用两种&#xff1a; 整型中&#xff0c;输出数据左对齐、右对齐、占m位、不足m位前补0。浮点型中&#xff0c;默认通过四舍五入保留小数点后6位&#xff0c;通过参数设置保留小数点后n位。 #include <stdio.h> #define PI 3.14159 /* 功能&#x…

D43【python 接口自动化学习】- python基础之函数

day43 装饰器&#xff08;上&#xff09; 学习日期&#xff1a;20241020 学习目标&#xff1a;函数&#xfe63;- 56 装饰器&#xff1a;函数嵌套的定义与调用的区别 学习笔记&#xff1a; 变量作用域 变量读取顺序&#xff1a;local-》enclosed-》global-》builtin # 变量…

Spring MessageSource国际化原理

spring framework提供MessasgeSource来实现国际化。 MessageSource用法 准备properties文件&#xff0c;放在resources文件夹下面。这是默认语言和韩语的文件。 i18n/message.propertiesi18n/message_ko.properties 文件里面的内容是key-value格式,使用{0}、{1}作为变量占位…

【Next.js 项目实战系列】05-删除 Issue

原文链接 CSDN 的排版/样式可能有问题&#xff0c;去我的博客查看原文系列吧&#xff0c;觉得有用的话&#xff0c;给我的库点个star&#xff0c;关注一下吧 上一篇【Next.js 项目实战系列】04-修改 Issue 删除 Issue 添加删除 Button​ 本节代码链接 这里我们主要关注布局…

Win10 IDEA远程连接HBase

Win10 IDEA远程连接HBase Win10 IDEA连接虚拟机中的Hadoop&#xff08;HDFS&#xff09; 关闭Hadoop和Hbase 如果已经关闭不需要走这一步 cd /usr/local/hbase bin/stop-hbase.sh cd /usr/local/hadoop ./sbin/stop-dfs.sh获取虚拟机的ip 虚拟机终端输入 ip a关闭虚拟机…

VS Code开发qt项目

没整明白&#xff0c;尴尬 安装扩展 设置cmake路径 前提是已经安装了QT 报错 用msvc选windows启动&#xff0c;用mingw则选gdb启动