-
-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathMakefile
More file actions
171 lines (141 loc) · 4.92 KB
/
Makefile
File metadata and controls
171 lines (141 loc) · 4.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# Compiler drivers: host C++ compiler, CUDA device compiler, HIP device compiler.
CXX = g++
NVCC = nvcc
HIPCC = hipcc

# Base flags for each compiler. These are deliberately recursive (`=`) variables:
# $(ROCM) and $(CUDA) are only assigned further down in this file, so they must be
# expanded when the flags are used, not when they are defined here.
# Each warning flag is listed once (-Wcast-align used to appear twice per list).
HIPCCFLAGS = -std=c++20 -I$(ROCM)/include -I./dsc/include/ --offload-arch=native \
             -Wall -Wextra -Wformat \
             -Wcast-qual -Wcast-align -Wstrict-aliasing -Wpointer-arith -Wunused -Wdouble-promotion \
             -Wno-missing-braces -fno-exceptions -fno-rtti

# -forward-unknown-opts lets nvcc hand the -W... / -f... options it does not know
# through to the host compiler selected by -ccbin.
NVCCFLAGS = -std=c++20 -I$(CUDA)/include -I./dsc/include/ -ccbin=$(CXX) -arch=native \
            -forward-unknown-opts -Wall -Wextra -Wformat -Wnoexcept \
            -Wcast-qual -Wcast-align -Wstrict-aliasing -Wpointer-arith -Wunused -Wdouble-promotion \
            -Wlogical-op -fno-exceptions -fno-rtti

CXXFLAGS = -std=c++20 -I./dsc/include/ -Wall -Wextra -Wformat -Wnoexcept \
           -Wcast-qual -Wcast-align -Wstrict-aliasing -Wpointer-arith -Wunused -Wdouble-promotion \
           -Wlogical-op -fno-exceptions -fno-rtti -pthread

# Libraries appended to the link line; GPU backends add their own further down.
LDFLAGS = -lm
# Host identification, queried once while the makefile is parsed:
# UNAME_S is the kernel name (`uname -s`), UNAME_M the machine type (`uname -m`).
UNAME_S := $(shell uname -s)
UNAME_M := $(shell uname -m)
# GPU backend auto-detection (opt-in: only performed when DSC_GPU is set).
# The vendor is inferred from whichever device compiler is found on PATH.
# CUDA takes precedence: hipcc is probed only when nvcc was not found, so a host
# with both toolchains installed selects CUDA instead of ending up with both
# DSC_CUDA and DSC_HIP set. `command -v` is used instead of `which` because it
# is a POSIX shell builtin and does not rely on an external utility.
ifdef DSC_GPU
  DSC_CUDA := $(shell command -v $(NVCC) 2>/dev/null)
  ifndef DSC_CUDA
    DSC_HIP := $(shell command -v $(HIPCC) 2>/dev/null)
  endif
endif

# Default toolkit roots for the selected backend. These are keyed on
# DSC_CUDA / DSC_HIP rather than on DSC_GPU so that selecting a backend
# explicitly (e.g. `make DSC_CUDA=1 shared`) also gets usable include and
# library paths. `?=` keeps any CUDA / ROCM value supplied by the user.
ifdef DSC_CUDA
  ifneq ($(wildcard /opt/cuda),)
    CUDA ?= /opt/cuda
  else
    CUDA ?= /usr/local/cuda
  endif
else
  # ROCm paths are only needed when CUDA is not the selected backend
  ifdef DSC_HIP
    ifneq ($(wildcard /opt/rocm),)
      ROCM ?= /opt/rocm
    else
      ROCM ?= /usr/local/rocm
    endif
  endif
endif
# A build targets at most one GPU backend: abort while parsing if CUDA and HIP
# are both selected.
ifdef DSC_HIP
  ifdef DSC_CUDA
    $(error ERROR: both DSC_CUDA and DSC_HIP are defined - this is not supported)
  endif
endif
# Use all available CPU extensions, x86 only.
# $(filter ...) yields a non-empty result only when UNAME_M is exactly x86_64;
# an empty UNAME_M (e.g. `uname` produced no output) no longer matches, so the
# native-tuning flags are not added on an unidentified host.
ifneq ($(filter x86_64,$(UNAME_M)),)
  CXXFLAGS += -march=native -mtune=native
endif
# Compile-time log level. A DSC_LOG_LEVEL supplied by the user is kept as is;
# otherwise it defaults to 0, or to 1 when DSC_FAST is set.
ifndef DSC_LOG_LEVEL
  DSC_LOG_LEVEL := 0
  ifdef DSC_FAST
    DSC_LOG_LEVEL := 1
  endif
endif

# The same preprocessor definition is handed to every compiler.
dsc_log_level_define = -DDSC_LOG_LEVEL=$(DSC_LOG_LEVEL)
HIPCCFLAGS += $(dsc_log_level_define)
NVCCFLAGS += $(dsc_log_level_define)
CXXFLAGS += $(dsc_log_level_define)
# Optimisation profile: unoptimised debug build by default, optimised build
# when DSC_FAST is set.
ifndef DSC_FAST
  # Debug: no optimisation, frame pointers kept, debug info emitted
  # (-G additionally passed to nvcc).
  HIPCCFLAGS += -O0 -fno-omit-frame-pointer -g
  NVCCFLAGS += -O0 -fno-omit-frame-pointer -g -G
  CXXFLAGS += -O0 -fno-omit-frame-pointer -g
else
  # -Ofast turns on all the unsafe math optimizations, including
  # -ffinite-math-only. That is a problem when testing: Inf and NaN have
  # different meanings but are treated as equal under -ffinite-math-only.
  # Assuming only finite numbers would be fine for inference, but the original
  # authors note that disabling it does not hurt performance, so
  # -fno-finite-math-only is kept and the tests can rely on Inf/NaN behaving
  # normally.
  HIPCCFLAGS += -O3
  NVCCFLAGS += -O3
  CXXFLAGS += -Ofast -fno-finite-math-only -ffp-contract=fast -funroll-loops -flto=auto -fuse-linker-plugin
endif
# Optional tracing: when DSC_TRACING is set, define DSC_TRACING=1 for the host
# compiler and for both device compilers.
ifdef DSC_TRACING
  dsc_tracing_define := -DDSC_TRACING=1
  HIPCCFLAGS += $(dsc_tracing_define)
  NVCCFLAGS += $(dsc_tracing_define)
  CXXFLAGS += $(dsc_tracing_define)
endif
# Objects that are linked into the shared library must be position-independent,
# so add -fPIC for every compiler whenever `shared` is requested.
# $(filter ...) is used instead of an exact comparison so the flag is also
# applied when `shared` is one of several goals (e.g. `make clean shared`).
ifneq ($(filter shared,$(MAKECMDGOALS)),)
  CXXFLAGS += -fPIC
  NVCCFLAGS += -fPIC
  HIPCCFLAGS += -fPIC
endif
# GPU translation units live in dsc/src/gpu; each .cpp maps to a .o placed
# next to it.
GPU_SRCS := $(wildcard dsc/src/gpu/*.cpp)
GPU_OBJS := $(patsubst %.cpp,%.o,$(GPU_SRCS))
# Enable CUDA support
ifdef DSC_CUDA
  # BF16 is supported in compute capability >= 8.0 (Ampere).
  # nvidia-smi prints one compute capability per GPU, so the list is reduced to
  # the lowest major version (`sort -n | head -n 1`): BF16 is enabled only when
  # every reported GPU supports it, and the shell test always receives a single
  # integer. A missing nvidia-smi or non-numeric output simply leaves
  # HAS_BF16_GPU empty instead of printing shell errors during parsing.
  # `csv` is included in --format because the nvidia-smi documentation lists it
  # as a mandatory format option.
  HAS_BF16_GPU := $(shell compute_major=$$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null \
                    | cut -d. -f1 | sort -n | head -n 1); \
                    if [ "$${compute_major:-0}" -ge 8 ] 2>/dev/null; then echo 1; fi)

  ifeq ($(HAS_BF16_GPU),1)
    NVCCFLAGS += -DDSC_BF16
    CXXFLAGS += -DDSC_BF16
  endif

  CXXFLAGS += -I$(CUDA)/include -DDSC_CUDA
  # `-x cu` makes nvcc treat the .cpp sources as CUDA code
  NVCCFLAGS += -x cu -DDSC_CUDA
  LDFLAGS += -L$(CUDA)/lib64 -lcudart -lcublas

  OBJS += $(GPU_OBJS)

# Static pattern rule: GPU objects are built with nvcc, not the host compiler.
$(GPU_OBJS): %.o: %.cpp
	$(NVCC) $(NVCCFLAGS) -c $< -o $@
endif
# Enable HIP support
ifdef DSC_HIP
  # Locate the ROCm tools under the same root that is used for the include and
  # library paths below ($(ROCM)); ROCM_PATH is only a fallback for setups that
  # export it without ROCM being set. Previously only ROCM_PATH was consulted,
  # which resolved to /bin/rocm_agent_enumerator whenever it was unset.
  dsc_rocm_root := $(or $(ROCM),$(ROCM_PATH))

  # Architectures of the installed GPUs, as reported by rocm_agent_enumerator;
  # BF16 is enabled when any of them matches the list of gfx targets below.
  GPU_TARGETS := $(shell $(dsc_rocm_root)/bin/rocm_agent_enumerator 2>/dev/null)
  HAS_BF16_GPU := $(shell echo '$(GPU_TARGETS)' | grep -q -E "gfx90a|gfx94[0-2]|gfx103[0-6]" && echo 1)

  ifeq ($(HAS_BF16_GPU),1)
    HIPCCFLAGS += -DDSC_BF16
    CXXFLAGS += -DDSC_BF16
  endif

  # TODO: is -D__HIP_PLATFORM_AMD__ required?
  CXXFLAGS += -I$(ROCM)/include -DDSC_HIP -D__HIP_PLATFORM_AMD__
  HIPCCFLAGS += -DDSC_HIP
  LDFLAGS += -L$(ROCM)/lib -lamdhip64 -lrocrand -lrocblas

  OBJS += $(GPU_OBJS)

# Static pattern rule: GPU objects are built with hipcc, not the host compiler.
$(GPU_OBJS): %.o: %.cpp
	$(HIPCC) $(HIPCCFLAGS) -c $< -o $@
endif
# Build summary, printed once while the makefile is parsed (before any rule
# runs). Compiler versions are captured first, then reported together with the
# final flag sets; device-compiler details appear only for the enabled backend.
dsc_cxx_version := $(shell $(CXX) --version | head -n 1)

$(info dsc build info: )
$(info OS: $(UNAME_S))
$(info ARCH: $(UNAME_M))
$(info CXX: $(dsc_cxx_version))
$(info CXXFLAGS: $(CXXFLAGS))

ifdef DSC_CUDA
  # The nvcc release line is the 4th line of `nvcc --version`
  dsc_nvcc_version := $(shell $(NVCC) --version | head -n 4 | tail -n 1)
  $(info NVCC: $(dsc_nvcc_version))
  $(info NVCCFLAGS: $(NVCCFLAGS))
endif

ifdef DSC_HIP
  dsc_hipcc_version := $(shell $(HIPCC) --version | head -n 1 | tail -n 1)
  $(info HIPCC: $(dsc_hipcc_version))
  $(info HIPCCFLAGS: $(HIPCCFLAGS))
endif

$(info LDFLAGS: $(LDFLAGS))
$(info )
# Host sources: everything directly under dsc/src plus dsc/src/cpu.
# `:=` expands the globs once, here; with a recursive `=` the $(wildcard ...)
# calls were re-run every time SRCS (and therefore OBJS) was expanded.
SRCS := $(wildcard dsc/src/*.cpp)
SRCS += $(wildcard dsc/src/cpu/*.cpp)

# Appended (not assigned) because a GPU backend may already have added its
# objects to OBJS earlier in the file.
OBJS += $(SRCS:.cpp=.o)

# Output path of the shared library, inside the Python package directory.
SHARED_LIB := python/dsc/libdsc.so
# `clean` and `shared` are commands, not files: declare them phony so a file
# with the same name can never make them look up to date.
.PHONY: clean
.PHONY: shared

# Remove build products: stray objects/libraries in the top directory, every
# object this makefile can produce (host and GPU), and the shared library.
clean:
	rm -rf *.o *.so *.old \
	  $(OBJS) $(GPU_OBJS) \
	  $(SHARED_LIB)
# `shared` is the user-facing entry point; the actual work is attached to the
# file it produces. Because $(SHARED_LIB) is a real file target, the library is
# relinked only when one of its objects is newer, instead of on every
# `make shared` as happened when the link recipe lived on the phony target.
shared: $(SHARED_LIB)

# Link all objects ($^) into the shared library ($@). LDFLAGS comes last so the
# -l libraries follow the objects that reference them.
$(SHARED_LIB): $(OBJS)
	$(CXX) $(CXXFLAGS) -shared $^ -o $@ $(LDFLAGS)
# Generic compile rule: build foo.o from foo.cpp with the host C++ compiler.
# $@ is the object being produced, $< the matching source file.
%.o: %.cpp
	$(CXX) $(CXXFLAGS) -o $@ -c $<