From 22575d7dce6409f16ebfc5d6f893f4d9115e1aeb Mon Sep 17 00:00:00 2001 From: Keyu Chen <54015474+km5ar@users.noreply.github.com> Date: Tue, 29 Oct 2024 22:36:19 -0400 Subject: [PATCH] fix "INPUT_FILE" path and missing variable "processed_text" --- .../Step-1 PDF-Pre-Processing-Logic.ipynb | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/recipes/quickstart/NotebookLlama/Step-1 PDF-Pre-Processing-Logic.ipynb b/recipes/quickstart/NotebookLlama/Step-1 PDF-Pre-Processing-Logic.ipynb index 2cf5d38d3..b3a747cf6 100644 --- a/recipes/quickstart/NotebookLlama/Step-1 PDF-Pre-Processing-Logic.ipynb +++ b/recipes/quickstart/NotebookLlama/Step-1 PDF-Pre-Processing-Logic.ipynb @@ -464,12 +464,12 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": null, "id": "a0183c47-339d-4041-ae83-77fc34931075", "metadata": {}, "outputs": [], "source": [ - "INPUT_FILE = \"./resources/extracted_text.txt\" # Replace with your file path\n", + "INPUT_FILE = \"extracted_text.txt\" # Replace with your file path\n", "CHUNK_SIZE = 1000 # Adjust chunk size if needed\n", "\n", "chunks = create_word_bounded_chunks(text, CHUNK_SIZE)\n", @@ -518,7 +518,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": null, "id": "7917dfdd-b3af-44fc-a8c0-2760ace9363e", "metadata": {}, "outputs": [ @@ -2616,15 +2616,18 @@ } ], "source": [ + "# Initialize processed_text before using it\n", + "processed_text = \"\"\n", + "\n", "with open(output_file, 'w', encoding='utf-8') as out_file:\n", " for chunk_num, chunk in enumerate(tqdm(chunks, desc=\"Processing chunks\")):\n", " # Process chunk and append to complete text\n", " processed_chunk = process_chunk(chunk, chunk_num)\n", " processed_text += processed_chunk + \"\\n\"\n", - " \n", + "\n", " # Write chunk immediately to file\n", " out_file.write(processed_chunk + \"\\n\")\n", - " out_file.flush()" + " out_file.flush()\n" ] }, {