-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrun_mod.sh
executable file
·95 lines (73 loc) · 5.81 KB
/
run_mod.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/bin/bash
#
# Runs ds_bench_modified.py (the inference_all_reduce benchmark driver) through
# the deepspeed launcher for every combination of:
#   dtype    : bf16, fp32
#   cache    : without, with (--cache)
#   launcher : default, impi (--hostfile hostfile.txt --force_multi --launcher impi)
#   compute  : with (--compute), without
#   elements : 16515072, 32768
#
# Requirements:
#   - CONDA_PREFIX set to the conda environment that provides libiomp5.so and
#     libtcmalloc.so under ${CONDA_PREFIX}/lib.
#   - deepspeed available on PATH.
#   - ds_bench_modified.py and hostfile.txt are referenced by relative path, so
#     the script is expected to be started from the directory containing them.
#
# Individual benchmark failures do not abort the sweep; the exit status is that
# of the last benchmark run.

if [[ -z "${CONDA_PREFIX:-}" ]]; then
  printf 'warning: CONDA_PREFIX is not set; preload/library paths will be wrong\n' >&2
fi

#export KMP_BLOCKTIME=1
#export KMP_SETTINGS=1

# Append to any existing value. "${VAR:+${VAR}:}" avoids a leading ':' when the
# variable is unset or empty (an empty LD_LIBRARY_PATH entry makes the dynamic
# loader search the current working directory).
export LD_PRELOAD="${LD_PRELOAD:+${LD_PRELOAD}:}${CONDA_PREFIX}/lib/libiomp5.so:${CONDA_PREFIX}/lib/libtcmalloc.so"

#export CCL_ALLREDUCE=recursive_doubling
export CCL_PROCESS_LAUNCHER=none
export CCL_ATL_TRANSPORT=ofi
export CCL_ATL_SHM=1
#export CCL_ITT_LEVEL=1
export CCL_WORKER_COUNT=1
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}${CONDA_PREFIX}/x86_64-conda-linux-gnu/lib"

# If the next line is enabled, use a small iteration count such as 50.
#export CCL_SCHED_PROFILE=1

# For 48 cores * 2: set CCL_WORKER_AFFINITY if necessary.
#export CCL_WORKER_AFFINITY=10,22,34,46,58,70,82,94

#######################################
# Launch a single ds_bench_modified.py run through deepspeed.
# Arguments:
#   $1 - launcher: "impi" adds the hostfile/--force_multi/--launcher impi
#        options; any other value uses deepspeed's default launcher.
#   $2 - dtype passed to --dtype (e.g. bf16, fp32).
#   $3 - element count passed to --elements.
#   $4.. - extra flags appended after --elements (e.g. --compute, --cache).
# Returns:
#   Exit status of the deepspeed command.
#######################################
run_benchmark() {
  local launcher=$1
  local dtype=$2
  local elements=$3
  shift 3

  local -a launcher_args=()
  if [[ "$launcher" == "impi" ]]; then
    launcher_args=(--hostfile hostfile.txt --force_multi --launcher impi)
  fi

  deepspeed "${launcher_args[@]}" --bind_cores_to_rank ds_bench_modified.py \
    --dtype "$dtype" --elements "$elements" "$@"
}

#######################################
# Run the full benchmark matrix (single node), printing a header line before
# each launcher/compute/cache/dtype group. Loop nesting reproduces the order
# dtype -> cache -> launcher -> compute -> elements.
# Returns:
#   127 if deepspeed is not found, otherwise the status of the last run.
#######################################
main() {
  # Fail once, clearly, instead of emitting one "command not found" per run.
  if ! command -v deepspeed >/dev/null 2>&1; then
    printf 'error: deepspeed not found on PATH\n' >&2
    return 127
  fi

  local dtype cache launcher compute elements
  local cache_label compute_label
  local -a extra_flags

  for dtype in bf16 fp32; do
    # inference_all_reduce without cache first, then with cache
    for cache in "" --cache; do
      if [[ -n "$cache" ]]; then
        cache_label="with"
      else
        cache_label="without"
      fi

      for launcher in default impi; do
        for compute in --compute ""; do
          extra_flags=()
          if [[ -n "$compute" ]]; then
            compute_label="with"
            extra_flags+=("$compute")
          else
            compute_label="without"
          fi
          if [[ -n "$cache" ]]; then
            extra_flags+=("$cache")
          fi

          echo " ${launcher} launcher, ${compute_label} compute, ${cache_label} cache, ${dtype}"
          for elements in 16515072 32768; do
            run_benchmark "$launcher" "$dtype" "$elements" "${extra_flags[@]}"
          done
        done
      done
    done
  done
}

main "$@"

#oneccl benchmark
#mpirun -n 8 ~/oneCCL/build/examples/benchmark/benchmark -d bfloat16 -f 10240 -t 10240 -i 50