From c8be8496c8a15d0ede8338939a7512109b8e5e46 Mon Sep 17 00:00:00 2001
From: 3gg <3gg@shellblade.net>
Date: Wed, 27 Nov 2024 13:41:09 -0800
Subject: Initial commit.

---
 vector_sum/CMakeLists.txt | 11 +++++++++
 vector_sum/main.cu        | 62 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 vector_sum/CMakeLists.txt
 create mode 100644 vector_sum/main.cu

(limited to 'vector_sum')

diff --git a/vector_sum/CMakeLists.txt b/vector_sum/CMakeLists.txt
new file mode 100644
index 0000000..1eea51b
--- /dev/null
+++ b/vector_sum/CMakeLists.txt
@@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.28)
+project(vector_sum LANGUAGES CUDA CXX)
+
+add_executable(vector_sum main.cu)
+
+# Warning flags. -Wall and -Wextra stay on, but pedantic diagnostics are
+# switched off: nvcc emits non-standard (gcc-specific) host code, which would
+# otherwise produce warnings such as the "style of line directive is a GCC
+# extension" one discussed here:
+# https://stackoverflow.com/questions/31000996/warning-when-compiling-cu-with-wpedantic-style-of-line-directive-is-a-gcc-ex
+target_compile_options(vector_sum PRIVATE -Wall -Wextra -Wno-pedantic)
diff --git a/vector_sum/main.cu b/vector_sum/main.cu
new file mode 100644
index 0000000..ba2e964
--- /dev/null
+++ b/vector_sum/main.cu
@@ -0,0 +1,86 @@
+#include <cstdio>
+
+// Element-wise vector sum: out[i] = a[i] + b[i] for every i in [0, N).
+//
+// Expects a 1D grid of 1D blocks with at least N threads in total; each thread
+// handles the element at its global index and threads past the end do nothing.
+// a, b and out must each point to at least N ints of device memory.
+__global__ void add(int N, const int* a, const int* b, int* out) {
+  const int id = blockIdx.x * blockDim.x + threadIdx.x;
+  if (id < N) {
+    out[id] = a[id] + b[id];
+  }
+}
+
+// Sums two N-element vectors on the GPU and prints the result.
+//
+// Returns 0 on success, or 1 (after reporting the error on stderr) if any CUDA
+// call fails.
+int main() {
+  constexpr int N                 = 100;
+  constexpr int threads_per_block = 128;
+  // Round up so a partially filled last block still covers the tail.
+  constexpr int num_blocks = (N + threads_per_block - 1) / threads_per_block;
+
+  // Everything is declared before the first goto: a jump to `cleanup` must not
+  // bypass an initialization.
+  cudaError_t err           = cudaSuccess;
+  int         host_array[N] = {0};
+  int*        dev_arrays[3] = {nullptr};  // [0], [1]: inputs; [2]: output.
+
+  // Allocate device arrays.
+  for (int i = 0; i < 3; ++i) {
+    err = cudaMalloc(&dev_arrays[i], N * sizeof(int));
+    if (err != cudaSuccess) {
+      goto cleanup;
+    }
+  }
+
+  // Fill the host array with values 0..N-1.
+  for (int i = 0; i < N; ++i) {
+    host_array[i] = i;
+  }
+
+  // Copy the host array to each of the first two device arrays.
+  for (int i = 0; i < 2; ++i) {
+    err = cudaMemcpy(
+        dev_arrays[i], host_array, N * sizeof(int), cudaMemcpyHostToDevice);
+    if (err != cudaSuccess) {
+      goto cleanup;
+    }
+  }
+
+  // Add the first two arrays.
+  add<<<num_blocks, threads_per_block>>>(
+      N, dev_arrays[0], dev_arrays[1], dev_arrays[2]);
+
+  // A kernel launch returns no status; a rejected launch is only visible
+  // through cudaGetLastError().
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    goto cleanup;
+  }
+
+  // Copy the result from the third array to the host. The copy is blocking, so
+  // it also waits for the kernel and reports any error raised while it ran.
+  err = cudaMemcpy(
+      host_array, dev_arrays[2], N * sizeof(int), cudaMemcpyDeviceToHost);
+  if (err != cudaSuccess) {
+    goto cleanup;
+  }
+
+  // Print the result.
+  for (int i = 0; i < N; ++i) {
+    printf("%d ", host_array[i]);
+  }
+  printf("\n");
+
+cleanup:
+  if (err != cudaSuccess) {
+    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
+  }
+  for (int i = 0; i < 3; ++i) {
+    cudaFree(dev_arrays[i]);  // No-op for slots that were never allocated.
+  }
+  return err == cudaSuccess ? 0 : 1;
+}
-- 
cgit v1.2.3