Skip to content

Commit df3719c

Browse files
committed
Add maxlength and stripthink calls, closes #10 and #15
1 parent cbb8ca6 commit df3719c

File tree

3 files changed

+14
-15
lines changed

3 files changed

+14
-15
lines changed

app/app.py

Lines changed: 5 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66
"""
77

88
import os
9-
import platform
109

1110
from urllib.parse import urlparse
1211

@@ -29,15 +28,12 @@ def __init__(self):
2928
Creates a new application.
3029
"""
3130

31+
# Search for additional allowed environment variables
32+
kwargs = {x: int(os.environ[x]) for x in ["n_ctx"] if x in os.environ}
33+
34+
# Annotation instance
3235
self.annotate = Annotate(
33-
os.environ.get(
34-
"LLM",
35-
(
36-
"NeuML/Llama-3.1_OpenScholar-8B-AWQ"
37-
if platform.machine() in ("x86_64", "AMD")
38-
else "bartowski/Llama-3.1_OpenScholar-8B-GGUF/Llama-3.1_OpenScholar-8B-Q4_K_M.gguf"
39-
),
40-
)
36+
llm=os.environ.get("LLM", "Qwen/Qwen3-4B-Instruct-2507"), maxlength=int(os.environ.get("MAXLENGTH", 10000)), **kwargs
4137
)
4238

4339
# Embeddings database for search (lazy loaded)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@
2323
package_dir={"": "src/python"},
2424
keywords="pdf highlight llm ai",
2525
python_requires=">=3.10",
26-
install_requires=["nltk>=3.5", "tqdm>=4.48.0", "txtai>=8.1.0", "txtmarker>=1.1.0"],
26+
install_requires=["nltk>=3.5", "tqdm>=4.48.0", "txtai>=9.2.0", "txtmarker>=1.1.0"],
2727
classifiers=[
2828
"License :: OSI Approved :: Apache Software License",
2929
"Operating System :: OS Independent",

src/python/annotateai/annotate.py

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -22,16 +22,19 @@ class Annotate(Pipeline):
2222
Automatically annotates papers using LLMs.
2323
"""
2424

25-
def __init__(self, llm):
25+
def __init__(self, llm, maxlength=10000, **kwargs):
2626
"""
2727
Creates a new Annotation pipeline.
2828
2929
Args:
3030
llm: LLM path
31+
maxlength: maximum prompt length
32+
kwargs: additional LLM args
3133
"""
3234

3335
# Create LLM pipeline
34-
self.llm = LLM(llm)
36+
self.llm = LLM(llm, **kwargs)
37+
self.maxlength = maxlength
3538

3639
# Create segmentation pipeline
3740
self.segment = Segmentation(sentences=True, cleantext=False)
@@ -141,7 +144,7 @@ def title(self, text, progress):
141144

142145
result = None
143146
for x in tqdm([prompt], desc="Extracting title", disable=not progress):
144-
result = self.llm([{"role": "user", "content": x}], maxlength=2048)
147+
result = self.llm([{"role": "user", "content": x}], maxlength=2048, stripthink=True)
145148

146149
return result
147150

@@ -165,7 +168,7 @@ def keywords(self, text, progress):
165168

166169
result = None
167170
for x in tqdm([prompt], desc="Generating keywords", disable=not progress):
168-
result = self.llm([{"role": "user", "content": x}], maxlength=2048)
171+
result = self.llm([{"role": "user", "content": x}], maxlength=2048, stripthink=True)
169172

170173
return result
171174

@@ -234,7 +237,7 @@ def topics(self, annotations, progress):
234237
topics = []
235238
for prompt in tqdm(prompts, desc="Generating topics", disable=not progress):
236239
# Generate topic
237-
topic = self.llm([{"role": "user", "content": prompt}], maxlength=10000)
240+
topic = self.llm([{"role": "user", "content": prompt}], maxlength=self.maxlength, stripthink=True)
238241

239242
# Clean topic and append
240243
topics.append(re.sub(r"[^\x00-\x7f]", r"", topic))

0 commit comments

Comments (0)