//
// Created by jimyma on 1/29/23.
//
#include "ATen/Context.h"
#include "tensorexpr/functor_parallization.h"
#include "torch/csrc/jit/ir/ir.h"
#include "torch/csrc/jit/tensorexpr/analysis.h"
#include "torch/csrc/jit/tensorexpr/codegen.h"
#include "torch/csrc/jit/tensorexpr/expr.h"
#include "torch/csrc/jit/tensorexpr/ir_simplifier.h"
#include "torch/csrc/jit/tensorexpr/loopnest.h"
using namespace torch::jit::tensorexpr;
/*
PyTorch hot-spot code being compiled:

def func(a_list: List[torch.Tensor], b: torch.Tensor):
    e_list = []
    for a in a_list:
        c = a + b
        d = c + b
        e_list.append(d[..., 0] + scalar_0)
    return e_list
*/
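// The example below hand-builds the NNC (tensorexpr) IR for one iteration of
// the loop body (the "functor"), schedules it for CUDA, and then replicates it
// across three list elements so a single kernel launch serves a_0, a_1 and a_2.
// The FunctorParallization helper comes from this repository's local
// "tensorexpr/functor_parallization.h" header, not from upstream PyTorch.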
int main() {
  at::globalContext().lazyInitCUDA();
  std::unordered_map<const torch::jit::Value*, BufPtr> bufs_;
  std::vector<CodeGen::BufferArg> bufferArgs_;
  std::unordered_set<BufPtr> bufOutputs_;
  int N = 16;
  // Functor Define
  // Input Define
  BufHandle a_buf("a_buf", {N, N}, kDouble);
  BufHandle b_buf("b_buf", {N, N}, kDouble);
  VarHandle scalar_0("scalar_0", kDouble);
  Tensor scalar_0_tensor(nullptr, nullptr);
  // Output Define
  Tensor c_tensor(nullptr, nullptr);
  Tensor d_tensor(nullptr, nullptr);
  Tensor e_tensor(nullptr, nullptr);
  // Compute Op Define
  c_tensor = Compute("c_buf", {LongImm::make(N), LongImm::make(N)},
                     [&](const std::vector<VarHandle>& axes) {
                       return a_buf.load(axes[0], axes[1]) +
                              b_buf.load(axes[0], axes[1]);
                     });
  d_tensor = Compute("d_buf", {LongImm::make(N), LongImm::make(N)},
                     [&](const std::vector<VarHandle>& axes) {
                       return b_buf.load(axes[0], axes[1]) +
                              c_tensor.load(axes[0], axes[1]);
                     });
  e_tensor = Compute("e_buf", {LongImm::make(N)},
                     [&](const std::vector<VarHandle>& axes) {
                       return d_tensor.load(axes[0], 0) + scalar_0;
                     });
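  // Each Compute call describes one output buffer plus the loop nest that
  // fills it: c_buf[i, j] = a_buf[i, j] + b_buf[i, j],
  // d_buf[i, j] = b_buf[i, j] + c_buf[i, j], and
  // e_buf[i] = d_buf[i, 0] + scalar_0, mirroring c, d and e in the
  // pseudocode above.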
  // Compute Op to Stmt
  auto block = alloc<Block>(std::vector<StmtPtr>({}));
  if (scalar_0_tensor.stmt()) block->append_stmt(scalar_0_tensor.stmt());
  if (c_tensor.stmt()) block->append_stmt(c_tensor.stmt());
  if (d_tensor.stmt()) block->append_stmt(d_tensor.stmt());
  if (e_tensor.stmt()) block->append_stmt(e_tensor.stmt());
  // Set Statement output
  bufOutputs_.insert(e_tensor.buf());
  // Loop Schedule
  LoopNest l(block, bufOutputs_);
  LoopNest::sanitizeNames(l.root_stmt());
  // Simplify
  l.simplify();
  // Compute Inline Begin
  l.inlineIntermediateBufs(/*allow_duplicated_work=*/true);
  l.optimizeConditionals();
  auto stmt_ = l.root_stmt();
  std::cout << "Inlined Statement... " << std::endl;
  std::cout << to_string(stmt_) << std::endl;
  // Compute Inline End
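  // inlineIntermediateBufs substitutes the defining expressions of the
  // non-output buffers (c_buf and d_buf) into their uses, so the remaining
  // statement computes e_buf directly from a_buf, b_buf and scalar_0 and the
  // intermediate buffers never need to be materialized.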
  // Functor Loop Binding
  for (auto buf : bufOutputs_) {
    std::vector<ForPtr> loops = l.getLoopStmtsFor(buf);
    if (loops.empty()) {
      // This happens when Buf is 0-dim
      continue;
    }
    ForPtr flattened = nullptr;
    LoopNest::flatten(loops, &flattened);
    assert(flattened);
    int loopLevels = -1;
    const int kDefaultLoopLevels = 2;
    loopLevels = (loopLevels > 0) ? loopLevels : kDefaultLoopLevels;
    int blockCount = -1;
    int blockSize = -1;
    ForPtr inner;
    const int kDefaultBlockSize = 512;
    blockSize = (blockSize > 0) ? blockSize : kDefaultBlockSize;
    LoopNest::splitWithMask(flattened, blockSize, &inner);
    flattened->set_gpu_block_index(0);
    inner->set_gpu_thread_index(0);
  }
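  // LoopNest::flatten collapses the loop nest over e_buf into a single loop,
  // and splitWithMask splits that loop into an outer loop and an inner loop of
  // blockSize (512) iterations, inserting a mask when the trip count is not a
  // multiple of the split factor. The outer loop is then bound to CUDA
  // blockIdx.x and the inner loop to threadIdx.x.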
  // Functor Parallelization
  // Add a new loop
  auto new_loop_axis = VarHandle("new_axis_i", kLong);
  stmt_ = alloc<For>(new_loop_axis.node(), immLike(new_loop_axis, 0),
                     LongImm::make(3).node(), stmt_);
  // Change Functor Load / Store to Parallelization Load / Store
  BufHandle a_0_buf("a_0_tensor", {N, N}, kDouble);
  BufHandle a_1_buf("a_1_tensor", {N, N}, kDouble);
  BufHandle a_2_buf("a_2_tensor", {N, N}, kDouble);
  BufHandle e_0_tensor("e_0_tensor", {N}, kDouble);
  BufHandle e_1_tensor("e_1_tensor", {N}, kDouble);
  BufHandle e_2_tensor("e_2_tensor", {N}, kDouble);
  stmt_ = FunctorParallization::parallel_functor_load(
      stmt_, 3, new_loop_axis.node(),
      {{a_buf.node(), {a_0_buf, a_1_buf, a_2_buf}}}, {});
  stmt_ = FunctorParallization::parallel_functor_store(
      stmt_, 3, new_loop_axis.node(),
      {{e_tensor.buf(), {e_0_tensor, e_1_tensor, e_2_tensor}}});
  static_to<For>(stmt_)->set_gpu_block_index(1);
  std::cout << to_string(stmt_) << std::endl;
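  // FunctorParallization (from this repository's local header) appears to
  // rewrite every load of a_buf and every store to e_buf so that the new
  // outer axis new_axis_i (0..2, bound to blockIdx.y above) selects among the
  // three per-list-element buffers; one kernel then processes a_0/a_1/a_2 in
  // a single launch.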
  // CodeGen
  l.prepareForCodegen();
  l.simplify();
  auto stmt = l.root_stmt();
  IRSimplifier::simplify(stmt);
  // Kernel Arguments
  bufferArgs_.emplace_back(a_0_buf);
  bufferArgs_.emplace_back(a_1_buf);
  bufferArgs_.emplace_back(a_2_buf);
  bufferArgs_.emplace_back(b_buf);
  bufferArgs_.emplace_back(scalar_0);
  bufferArgs_.emplace_back(e_0_tensor);
  bufferArgs_.emplace_back(e_1_tensor);
  bufferArgs_.emplace_back(e_2_tensor);
  // NNC CodeGen
  auto codegen_ = CreateCodeGen("cuda_codegen", stmt_, bufferArgs_, at::kCUDA);
  std::cout << codegen_->getCodeText() << std::endl;
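  // CreateCodeGen with the "cuda_codegen" backend compiles stmt_ into a CUDA
  // kernel whose parameters correspond one-to-one with bufferArgs_;
  // getCodeText() returns the generated CUDA source for inspection.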
  // PyTorch Runtime
  // Inputs
  auto a_0_runtime = at::ones({N, N}, at::kDouble).cuda();
  auto a_1_runtime = at::ones({N, N}, at::kDouble).cuda() * 2.0;
  auto a_2_runtime = at::ones({N, N}, at::kDouble).cuda() * 3.0;
  auto b_runtime = at::ones({N, N}, at::kDouble).cuda();
  auto scalar_0_runtime = 2.0;
  std::vector<c10::IValue> inputs = {a_0_runtime, a_1_runtime, a_2_runtime,
                                     b_runtime, scalar_0_runtime};
  // Outputs
  auto e_0_runtime = codegen_->empty_strided(
      {N}, {1}, c10::kDouble, c10::kStrided, c10::kCUDA, false);
  auto e_1_runtime = codegen_->empty_strided(
      {N}, {1}, c10::kDouble, c10::kStrided, c10::kCUDA, false);
  auto e_2_runtime = codegen_->empty_strided(
      {N}, {1}, c10::kDouble, c10::kStrided, c10::kCUDA, false);
  // Get CodeGen Runtime Arguments
  std::vector<CodeGen::CallArg> runArgs;
  runArgs.reserve(inputs.size() + 3);
  for (auto& input : inputs) {
    if (input.isDouble()) {
      runArgs.emplace_back(input.toDouble());
    } else {
      runArgs.emplace_back(input.toTensor().data_ptr());
    }
  }
  runArgs.emplace_back(e_0_runtime.data_ptr());
  runArgs.emplace_back(e_1_runtime.data_ptr());
  runArgs.emplace_back(e_2_runtime.data_ptr());
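  // CallArgs are bound positionally, so their order must match bufferArgs_
  // above: a_0, a_1, a_2, b, scalar_0, then the three output pointers.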
  // CUDA Kernel Launch
  codegen_->call(runArgs);
  // Print Output
  std::cout << e_0_runtime << std::endl;
  std::cout << e_1_runtime << std::endl;
  std::cout << e_2_runtime << std::endl;
  std::cout << "Done!" << std::endl;
  return 0;
}