Skip to content

Commit df3719c

Browse files
committed
Add maxlength and stripthink calls, closes #10 and #15
1 parent cbb8ca6 commit df3719c

File tree

3 files changed

+14
-15
lines changed

3 files changed

+14
-15
lines changed

app/app.py

Lines changed: 5 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66
"""
77

88
import os
9-
import platform
109

1110
from urllib.parse import urlparse
1211

@@ -29,15 +28,12 @@ def __init__(self):
2928
Creates a new application.
3029
"""
3130

31+
# Search for additional allowed environment variables
32+
kwargs = {x: int(os.environ[x]) for x in ["n_ctx"] if x in os.environ}
33+
34+
# Annotation instance
3235
self.annotate = Annotate(
33-
os.environ.get(
34-
"LLM",
35-
(
36-
"NeuML/Llama-3.1_OpenScholar-8B-AWQ"
37-
if platform.machine() in ("x86_64", "AMD")
38-
else "bartowski/Llama-3.1_OpenScholar-8B-GGUF/Llama-3.1_OpenScholar-8B-Q4_K_M.gguf"
39-
),
40-
)
36+
llm=os.environ.get("LLM", "Qwen/Qwen3-4B-Instruct-2507"), maxlength=int(os.environ.get("MAXLENGTH", 10000)), **kwargs
4137
)
4238

4339
# Embeddings database for search (lazy loaded)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@
2323
package_dir={"": "src/python"},
2424
keywords="pdf highlight llm ai",
2525
python_requires=">=3.10",
26-
install_requires=["nltk>=3.5", "tqdm>=4.48.0", "txtai>=8.1.0", "txtmarker>=1.1.0"],
26+
install_requires=["nltk>=3.5", "tqdm>=4.48.0", "txtai>=9.2.0", "txtmarker>=1.1.0"],
2727
classifiers=[
2828
"License :: OSI Approved :: Apache Software License",
2929
"Operating System :: OS Independent",

src/python/annotateai/annotate.py

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -22,16 +22,19 @@ class Annotate(Pipeline):
2222
Automatically annotates papers using LLMs.
2323
"""
2424

25-
def __init__(self, llm):
25+
def __init__(self, llm, maxlength=10000, **kwargs):
2626
"""
2727
Creates a new Annotation pipeline.
2828
2929
Args:
3030
llm: LLM path
31+
maxlength: maximum prompt length
32+
kwargs: additional LLM args
3133
"""
3234

3335
# Create LLM pipeline
34-
self.llm = LLM(llm)
36+
self.llm = LLM(llm, **kwargs)
37+
self.maxlength = maxlength
3538

3639
# Create segmentation pipeline
3740
self.segment = Segmentation(sentences=True, cleantext=False)
@@ -141,7 +144,7 @@ def title(self, text, progress):
141144

142145
result = None
143146
for x in tqdm([prompt], desc="Extracting title", disable=not progress):
144-
result = self.llm([{"role": "user", "content": x}], maxlength=2048)
147+
result = self.llm([{"role": "user", "content": x}], maxlength=2048, stripthink=True)
145148

146149
return result
147150

@@ -165,7 +168,7 @@ def keywords(self, text, progress):
165168

166169
result = None
167170
for x in tqdm([prompt], desc="Generating keywords", disable=not progress):
168-
result = self.llm([{"role": "user", "content": x}], maxlength=2048)
171+
result = self.llm([{"role": "user", "content": x}], maxlength=2048, stripthink=True)
169172

170173
return result
171174

@@ -234,7 +237,7 @@ def topics(self, annotations, progress):
234237
topics = []
235238
for prompt in tqdm(prompts, desc="Generating topics", disable=not progress):
236239
# Generate topic
237-
topic = self.llm([{"role": "user", "content": prompt}], maxlength=10000)
240+
topic = self.llm([{"role": "user", "content": prompt}], maxlength=self.maxlength, stripthink=True)
238241

239242
# Clean topic and append
240243
topics.append(re.sub(r"[^\x00-\x7f]", r"", topic))

0 commit comments

Comments (0)