-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.cpp
More file actions
44 lines (34 loc) · 1.08 KB
/
main.cpp
File metadata and controls
44 lines (34 loc) · 1.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#include "tokenizer.h"
#include <cstddef>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>
// Demo driver for RBTokenizer.
//
// Reads the whole of "data.txt", trains the tokenizer on the leading 40% of
// its bytes, then encodes a fixed sample sentence, prints the resulting IDs,
// and decodes them back to text.
//
// Returns 1 if the input file cannot be opened, 0 otherwise.
int main() {
  // The tokenizer is constructed before any file I/O takes place.
  RBTokenizer tokenizer(512);

  std::string inputFilePath{"data.txt"};
  std::ifstream input{inputFilePath};
  if (!input.is_open()) {
    std::cerr << "Failed to open file: " << inputFilePath << '\n' << std::flush;
    return 1;
  }

  // Pull the entire stream into memory, then release the file handle.
  std::string corpus;
  corpus.assign(std::istreambuf_iterator<char>{input},
                std::istreambuf_iterator<char>{});
  input.close();

  // Only the first 40% of the bytes (fraction truncated toward zero) is
  // used for training.
  const size_t trainLength = static_cast<size_t>(corpus.size() * 0.4);
  std::string trainData(corpus, 0, trainLength);

  // Flush here so the banner is emitted before training starts.
  std::cout << "Training data" << '\n' << std::flush;
  tokenizer.train(trainData, 30000);

  std::string text{"Before we proceed any further, hear me speak"};
  std::cout << "Encoding text: " << text << '\n' << std::flush;

  std::vector<int> encoded = tokenizer.encode(text);
  std::cout << "Encoded IDs: ";
  for (auto it = encoded.begin(); it != encoded.end(); ++it) {
    std::cout << *it << " ";
  }
  std::cout << '\n' << std::flush;

  std::string decoded = tokenizer.decode(encoded);
  std::cout << "Decoded: " << decoded << '\n' << std::flush;

  return 0;
}