Coder Social home page Coder Social logo

pr-merge's Introduction

  • Kokkos: develop
  • Kokkos Kernels: feature/merge-path

Vortex

m KokkosKernels_common_cuda KokkosKernels_graph_cuda

jsrun -n1 -c 44 -g 1 -b rs -M -disable_gpu_hooks kokkos-kernels/graph/unit_test/KokkosKernels_graph_cuda --gtest_filter="*merge*"

jsrun -n1 -c 44 -g 1 -b rs -M -disable_gpu_hooks kokkos-kernels/graph/unit_test/KokkosKernels_graph_cuda --gtest_filter="*load_balance*"

jsrun -n1 -c 44 -g 1 -b rs -M -disable_gpu_hooks kokkos-kernels/common/unit_test/KokkosKernels_common_cuda --gtest_filter="*lower_bound*"

OpenMP run on Vortex

jsrun -n1 -c 44 -g 1 -b rs -M -disable_gpu_hooks kokkos-kernels/perf_test/sparse/sparse_kk_spmv_merge ~/suitesparse/kind_undirected_graph/pdb1HYS.mtx

CUDA

source ../load-env.sh
cmake .. \
-DCMAKE_CXX_COMPILER=${CC} \
-DCMAKE_BUILD_TYPE=Release \
-DKokkos_ENABLE_HWLOC=Off \
-DKokkosKernels_INST_COMPLEX_FLOAT=ON \
-DKokkosKernels_INST_DOUBLE=ON \
-DKokkosKernels_INST_FLOAT=ON \
-DKokkosKernels_INST_HALF=OFF \
-DKokkosKernels_INST_OFFSET_INT=ON \
-DKokkosKernels_INST_OFFSET_SIZE_T=ON \
-DKokkosKernels_ENABLE_TESTS=ON \
-DKokkos_ENABLE_CUDA=ON \
-DKokkos_ENABLE_CUDA_LAMBDA=On \
-DKokkos_ARCH_VOLTA70=On \
-DKokkosKernels_INST_MEMSPACE_CUDAUVMSPACE=OFF \
-DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \
-DKokkosKernels_ENABLE_GRAPH=ON

OpenMP

source ../load-env.sh
cmake .. \
-DKokkos_ENABLE_OPENMP=ON \
-DKokkos_ARCH_POWER9=On \
-DCMAKE_BUILD_TYPE=Release \
-DKokkos_ENABLE_HWLOC=Off \
-DKokkosKernels_INST_COMPLEX_FLOAT=ON \
-DKokkosKernels_INST_DOUBLE=ON \
-DKokkosKernels_INST_FLOAT=OFF \
-DKokkosKernels_INST_HALF=OFF \
-DKokkosKernels_INST_OFFSET_INT=OFF \
-DKokkosKernels_INST_OFFSET_SIZE_T=ON \
-DKokkosKernels_INST_LAYOUTRIGHT=ON \
-DKokkosKernels_INST_LAYOUTLEFT=ON \
-DKokkosKernels_ENABLE_TESTS=ON

OpenMP + CUDA

source ../load-env.sh
cmake .. \
-DCMAKE_CXX_COMPILER=${NVCC_WRAPPER} \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_FLAGS="-Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wuninitialized -Wunused-local-typedefs" \
-DCMAKE_CXX_STANDARD=17 \
-DKokkos_ENABLE_CUDA=ON \
-DKokkos_ENABLE_CUDA_LAMBDA=On \
-DKokkos_ENABLE_OPENMP=ON \
-DKokkos_ENABLE_TESTS=OFF \
-DKokkos_ARCH_POWER9=On \
-DKokkos_ARCH_VOLTA70=On \
-DKokkosKernels_ENABLE_ALL_COMPONENTS=ON \
-DKokkosKernels_ENABLE_TESTS=ON \
-DKokkosKernels_ENABLE_PERFTESTS=ON \
-DKokkosKernels_ENABLE_BENCHMARK=ON \
-DKokkosKernels_INST_COMPLEX_FLOAT=OFF \
-DKokkosKernels_INST_DOUBLE=OFF \
-DKokkosKernels_INST_FLOAT=OFF \
-DKokkosKernels_INST_HALF=OFF \
-DKokkosKernels_INST_OFFSET_INT=OFF \
-DKokkosKernels_INST_OFFSET_SIZE_T=OFF \
-DKokkosKernels_INST_LAYOUTRIGHT=OFF \
-DKokkosKernels_INST_MEMSPACE_CUDAUVMSPACE=OFF \
-DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF

kokkos-dev-2

m \
KokkosKernels_common_cuda \
KokkosKernels_common_openmp \
KokkosKernels_graph_cuda \
KokkosKernels_graph_openmp \
KokkosGraph_loadbalance_perf_test \
KokkosGraph_merge_perf_test \
sparse_kk_spmv_merge \
sparse_kk_spmv

m KokkosKernels_graph_cuda && kokkos-kernels/graph/unit_test/KokkosKernels_graph_cuda --gtest_filter="*load_balance*"

kokkos-kernels/graph/unit_test/KokkosKernels_graph_cuda --gtest_filter="*merge*"

kokkos-kernels/common/unit_test/KokkosKernels_common_cuda --gtest_filter="*bound*"

m KokkosGraph_loadbalance_perf_test && kokkos-kernels/perf_test/graph/KokkosGraph_loadbalance_perf_test

OpenMP + CUDA

source ../load-env.sh
cmake .. \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER=${NVCC_WRAPPER} \
-DCMAKE_CXX_FLAGS="-Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wuninitialized -Wunused-local-typedefs" \
-DCMAKE_CXX_STANDARD=17 \
-DKokkos_ENABLE_OPENMP=ON \
-DKokkos_ENABLE_SERIAL=OFF \
-DKokkos_ARCH_SKX=ON \
-DKokkos_ENABLE_CUDA=ON \
-DKokkos_ENABLE_CUDA_LAMBDA=On \
-DKokkos_ARCH_VOLTA70=ON \
-DKokkosKernels_ENABLE_TESTS=ON \
-DKokkosKernels_ENABLE_ALL_COMPONENTS=ON \
-DKokkosKernels_ENABLE_PERFTESTS=ON \
-DKokkosKernels_ENABLE_BENCHMARK=ON \
-DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF

Serial

source ../load-env.sh
cmake .. \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER=g++ \
-DCMAKE_CXX_FLAGS="-Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wuninitialized -Wunused-local-typedefs" \
-DKokkos_ENABLE_SERIAL=ON \
-DKokkos_ARCH_SKX=ON \
-DKokkosKernels_ENABLE_TESTS=ON \
-DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF

Attaway

OpenMP

source ../load-env.sh
cmake .. \
-DKokkos_ENABLE_OPENMP=ON \
-DCMAKE_BUILD_TYPE=Release \
-DKokkos_ENABLE_HWLOC=Off \
-DKokkosKernels_INST_COMPLEX_FLOAT=ON \
-DKokkosKernels_INST_DOUBLE=ON \
-DKokkosKernels_INST_FLOAT=OFF \
-DKokkosKernels_INST_HALF=OFF \
-DKokkosKernels_INST_OFFSET_INT=OFF \
-DKokkosKernels_INST_OFFSET_SIZE_T=ON \
-DKokkosKernels_INST_LAYOUTRIGHT=ON \
-DKokkosKernels_INST_LAYOUTLEFT=ON \
-DKokkosKernels_ENABLE_TESTS=ON

WCIDs: FY210059: this is Sake/PEEK

Interactive:

salloc --nodes=1 --ntasks-per-node=36 --time=4:00:00 --account=PUT_YOUR_WCID_HERE

Blake

OpenMP

source ../load-env.sh
cmake .. \
-DKokkos_ENABLE_OPENMP=ON \
-DKokkos_ARCH_SKX=ON \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_FLAGS="-g" \
-DKokkos_ENABLE_HWLOC=Off \
-DKokkosKernels_INST_COMPLEX_FLOAT=ON \
-DKokkosKernels_INST_DOUBLE=ON \
-DKokkosKernels_INST_FLOAT=OFF \
-DKokkosKernels_INST_HALF=OFF \
-DKokkosKernels_INST_OFFSET_INT=OFF \
-DKokkosKernels_INST_OFFSET_SIZE_T=ON \
-DKokkosKernels_INST_LAYOUTRIGHT=ON \
-DKokkosKernels_INST_LAYOUTLEFT=ON \
-DKokkosKernels_ENABLE_TESTS=ON

m sparse_kk_spmv_merge sparse_spmv KokkosKernels_graph_openmp

Interactive:

salloc -N 1 --time=02:00:00 

batch

sbatch new-spmv.sh

jobs

squeue

Caraway

Caraway has a few different kinds of nodes (see the partitions listed by sinfo).

VEGA908 for MI100, VEGA90A for MI200. HIP+Serial:

source ../load-env.sh
cmake .. \
-DCMAKE_BUILD_TYPE=Release \
-DKokkosKernels_INST_COMPLEX_FLOAT=ON \
-DKokkosKernels_INST_DOUBLE=ON \
-DKokkosKernels_INST_FLOAT=ON \
-DKokkosKernels_INST_OFFSET_SIZE_T=ON \
-DKokkosKernels_ENABLE_TESTS=ON \
-DKokkos_ENABLE_HIP=ON \
-DKokkos_ARCH_VEGA90A=ON \
-DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF

m sparse_kk_spmv_merge sparse_spmv

Interactive:

salloc -N 1 -p MI100 --time=00:30:00 

batch

sbatch new-spmv.sh

jobs

squeue

A100 on caraway

source $HOME/spack-caraway/spack/share/spack/setup-env.sh
spack load cuda
module load cmake
export NVCC_WRAPPER=`readlink -f ../kokkos/bin/nvcc_wrapper`
cmake .. \
-DCMAKE_CXX_COMPILER=${NVCC_WRAPPER} \
-DCMAKE_BUILD_TYPE=Release \
-DKokkos_ENABLE_CUDA=ON \
-DKokkos_ENABLE_CUDA_LAMBDA=On \
-DKokkos_ARCH_AMPERE80=On \
-DKokkosKernels_INST_MEMSPACE_CUDAUVMSPACE=OFF \
-DKokkosKernels_ENABLE_TPL_CUSPARSE=ON \
-DKokkosKernels_ENABLE_ALL_COMPONENTS=ON \
-DKokkosKernels_ENABLE_TESTS=ON \
-DKokkosKernels_ENABLE_PERFTESTS=ON \
-DKokkosKernels_ENABLE_BENCHMARK=ON

Perlmutter

CUDA

source ../load-env.sh
cmake .. \
-DCMAKE_CXX_COMPILER=${NVCC_WRAPPER} \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_FLAGS="-Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wuninitialized -Wunused-local-typedefs" \
-DCMAKE_CXX_STANDARD=17 \
-DKokkos_ENABLE_CUDA=ON \
-DKokkos_ENABLE_CUDA_LAMBDA=On \
-DKokkos_ARCH_AMPERE80=On \
-DKokkos_ENABLE_HWLOC=OFF \
-DKokkosKernels_INST_COMPLEX_FLOAT=OFF \
-DKokkosKernels_INST_DOUBLE=OFF \
-DKokkosKernels_INST_FLOAT=OFF \
-DKokkosKernels_INST_HALF=OFF \
-DKokkosKernels_INST_OFFSET_INT=OFF \
-DKokkosKernels_INST_OFFSET_SIZE_T=OFF \
-DKokkosKernels_INST_ORDINAL_INT=OFF \
-DKokkosKernels_INST_LAYOUTRIGHT=OFF \
-DKokkosKernels_ENABLE_ALL_COMPONENTS=ON \
-DKokkosKernels_ENABLE_TESTS=ON \
-DKokkosKernels_ENABLE_PERFTESTS=ON \
-DKokkosKernels_ENABLE_BENCHMARK=ON \
-DKokkosKernels_INST_MEMSPACE_CUDAUVMSPACE=OFF \
-DKokkosKernels_ENABLE_TPL_CUSPARSE=ON
salloc --nodes 1 --qos interactive --time 01:00:00 --constraint gpu --gpus 4 --account=m3918_g
srun 
sqs

Crusher

salloc -A CSC465_crusher -J interactive -t 01:00:00 -p batch -N 1

VEGA90A for MI200 HIP+Serial

source ../load-env.sh
cmake .. \
-DCMAKE_CXX_COMPILER=hipcc \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_STANDARD=17 \
-DCMAKE_EXE_LINKER_FLAGS="-lstdc++fs" \
-DKokkos_ENABLE_HIP=ON \
-DKokkos_ARCH_VEGA90A=ON \
-DKokkos_ARCH_ZEN3=ON \
-DKokkosKernels_ENABLE_TPL_ROCSPARSE=ON \
-DKokkosKernels_ENABLE_ALL_COMPONENTS=ON \
-DKokkosKernels_ENABLE_TESTS=ON \
-DKokkosKernels_ENABLE_PERFTESTS=ON \
-DKokkosKernels_ENABLE_BENCHMARK=ON

profiling

kernels called, time

rocprof --stats --timestamp on kokkos-kernels/perf_test/sparse/sparse_kk_spmv_merge ~/csc465/proj-shared/cpearson/suitesparse/reals_med/2cubes_sphere.mtx
  gpu-agent0 : TCC_HIT[0-31] : Number of cache hits.
      block TCC has 4 counters

  gpu-agent0 : TCC_MISS[0-31] : Number of cache misses. UC reads count as misses.
      block TCC has 4 counters
gpu-agent0 : SQ_LDS_BANK_CONFLICT : Number of cycles LDS is stalled by bank conflicts. (emulated)
      block SQ has 8 counters
  gpu-agent0 : SQ_INSTS_LDS : Number of LDS instructions issued (including FLAT). (per-simd, emulated)
      block SQ has 8 counters

  gpu-agent0 : SQ_INSTS_GDS : Number of GDS instructions issued. (per-simd, emulated)
      block SQ has 8 counters

  gpu-agent0 : SQ_WAIT_INST_LDS : Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)
      block SQ has 8 counters
gpu-agent0 : TCC_HIT_sum : Number of cache hits. Sum over TCC instances.
      TCC_HIT_sum = sum(TCC_HIT,32)

  gpu-agent0 : TCC_MISS_sum : Number of cache misses. Sum over TCC instances.
      TCC_MISS_sum = sum(TCC_MISS,32)
  • Glossary
    • TCC:
    • GDS: global data share, globally-shared explicitly-addressed memory

Atomics are generally classified as write requests

rzvernal

  • using CC instead of hipcc causes the compiler to crash

VEGA90A for MI250x HIP+Serial

source ../load-env.sh
cmake .. \
-DCMAKE_CXX_COMPILER=hipcc \
-DCMAKE_BUILD_TYPE=Release \
-DKokkosKernels_INST_COMPLEX_FLOAT=ON \
-DKokkosKernels_INST_DOUBLE=ON \
-DKokkosKernels_INST_FLOAT=ON \
-DKokkosKernels_INST_HALF=OFF \
-DKokkosKernels_INST_OFFSET_INT=ON \
-DKokkosKernels_INST_OFFSET_SIZE_T=ON \
-DKokkosKernels_ENABLE_TESTS=ON \
-DKokkos_ENABLE_HIP=ON \
-DKokkos_ARCH_VEGA90A=ON \
-DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF
m sparse_kk_spmv_merge sparse_spmv KokkosKernels_graph_hip
salloc -G 1
srun -n 1 -G 1 -c 2 kokkos-kernels/perf_test/sparse/sparse_kk_spmv_merge ~/suitesparse/reals_med/apache1.mtx

pr-merge's People

Contributors

cwpearson avatar

Watchers

 avatar

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games, to make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google โค๏ธ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.