
Commit f0f64fd

Merge pull request #21 from feast-dev/rag
chore: Updating the chunking to do sentence chunking
2 parents c5af6c5 + 7a76a33

File tree

2 files changed: +27 -11 lines changed
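
As the diff below shows, the commit stops embedding each full Wiki Summary and instead splits every summary into sentences with NLTK's sent_tokenize, explodes the frame to one row per sentence, and caches the result as a chunked CSV. A minimal sketch of that chunking step on toy data (the City column is made up for illustration; sent_tokenize needs NLTK's sentence-tokenizer data downloaded once):

import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize

nltk.download("punkt")  # one-time download; newer NLTK releases may ask for "punkt_tab"

df = pd.DataFrame({
    "City": ["Paris"],
    "Wiki Summary": ["Paris is the capital of France. It sits on the Seine."],
})
# Split each summary into a list of sentences, then explode to one row per sentence.
df["Sentence Chunks"] = df["Wiki Summary"].apply(sent_tokenize)
chunked = df.explode("Sentence Chunks")
# -> two rows, both City == "Paris", each carrying a single sentence chunk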

module_4_rag/batch_score_documents.py

Lines changed: 27 additions & 11 deletions
@@ -1,11 +1,17 @@
 import os
 import pandas as pd
+from nltk.tokenize import sent_tokenize
 from transformers import AutoTokenizer, AutoModel
 import torch
 import torch.nn.functional as F
 
-INPUT_FILENAME = "./data/city_wikipedia_summaries.csv"
-EXPORT_FILENAME = "./data/city_wikipedia_summaries_with_embeddings.parquet"
+BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), "feature_repo"))
+DATA_DIR = os.path.join(BASE_DIR, "data")
+INPUT_FILENAME = os.path.join(DATA_DIR, "city_wikipedia_summaries.csv")
+CHUNKED_FILENAME = os.path.join(DATA_DIR, "city_wikipedia_summaries_chunked.csv")
+EXPORT_FILENAME = os.path.join(
+    DATA_DIR, "city_wikipedia_summaries_with_embeddings.parquet"
+)
 TOKENIZER = "sentence-transformers/all-MiniLM-L6-v2"
 MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 
@@ -36,23 +42,33 @@ def run_model(sentences, tokenizer, model):
 
 
 def score_data() -> None:
-    if EXPORT_FILENAME not in os.listdir():
-        print("scored data not found...generating embeddings...")
-        df = pd.read_csv(INPUT_FILENAME)
+    os.makedirs(DATA_DIR, exist_ok=True)
+
+    if not os.path.exists(EXPORT_FILENAME):
+        print("Scored data not found... generating embeddings...")
+
+        if not os.path.exists(CHUNKED_FILENAME):
+            print("Chunked data not found... generating chunked data...")
+            df = pd.read_csv(INPUT_FILENAME)
+            df["Sentence Chunks"] = df["Wiki Summary"].apply(lambda x: sent_tokenize(x))
+            chunked_df = df.explode("Sentence Chunks")
+            chunked_df.to_csv(CHUNKED_FILENAME, index=False)
+            df = chunked_df
+        else:
+            df = pd.read_csv(CHUNKED_FILENAME)
+
         tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
         model = AutoModel.from_pretrained(MODEL)
         embeddings = run_model(df["Wiki Summary"].tolist(), tokenizer, model)
-        print(embeddings)
-        print("shape = ", df.shape)
-        df["Embeddings"] = list(embeddings.detach().cpu().numpy())
         print("embeddings generated...")
+        df["Embeddings"] = list(embeddings.detach().cpu().numpy())
         df["event_timestamp"] = pd.to_datetime("today")
         df["item_id"] = df.index
-        print(df.head())
+
         df.to_parquet(EXPORT_FILENAME, index=False)
-        print("...data exported. job complete")
+        print("...data exported. Job complete")
     else:
-        print("scored data found...skipping generating embeddings.")
+        print("Scored data found... skipping generating embeddings.")
 
 
 if __name__ == "__main__":
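
The hunk header above references run_model, whose body this commit leaves untouched, so it does not appear in the diff. For sentence-transformers/all-MiniLM-L6-v2, the conventional recipe is mean pooling over token embeddings followed by L2 normalization; the sketch below shows what such a helper typically looks like and is an assumption, not the committed code:

import torch
import torch.nn.functional as F

def run_model(sentences, tokenizer, model):
    # Tokenize the batch and run a forward pass without tracking gradients.
    encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**encoded)
    token_embeddings = output[0]  # (batch, seq_len, hidden)
    mask = encoded["attention_mask"].unsqueeze(-1).float()
    # Mean-pool over real (non-padding) tokens only.
    pooled = (token_embeddings * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
    return F.normalize(pooled, p=2, dim=1)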
The second changed file is binary and is not shown.
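
Note that both generated files act as caches: the chunked CSV is rebuilt only when CHUNKED_FILENAME is missing, and embedding is skipped entirely once EXPORT_FILENAME exists, so deleting either file under feature_repo/data forces that stage to run again.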
