96 changes: 95 additions & 1 deletion hypothesis_agent/literature_review_agent/literature_review.py
@@ -1,15 +1,20 @@
import os
import json
import shutil
import subprocess
import time
import requests
from string import Template
from typing import Dict, List, Union
from hypogenic.algorithm.generation.utils import extract_hypotheses
from hypogenic.LLM_wrapper import LLMWrapper
from hypogenic.logger_config import LoggerConfig

from .literature_processor.extract_info import BaseExtractor
from .literature_processor.summarize import BaseSummarize
from ..data_analysis_agent.prompt import TestPrompt

from .literature_search import auto_literature_search
from doc2json.grobid2json.process_pdf import process_pdf_file

class LiteratureAgent:
def __init__(
@@ -55,3 +60,92 @@ def refine_hypotheses(
)

return extract_hypotheses(response, len(hypotheses_list))

def auto_process_literature(
self,
num_papers: int = 10,
num_papers_per_trial: int = 10,
max_search_trial: int = 5,
cache_seed = None,
**generate_kwargs,
):
logger = LoggerConfig.get_logger("automated literature processing")
task_name = self.prompt_class.task.task_name
if "automated_literature_search_topic" in self.prompt_class.task.prompt_template:
search_topic = self.prompt_class.task.prompt_template["automated_literature_search_topic"]
else:
search_topic = task_name
save_dir = auto_literature_search(
topic=search_topic,
num_papers=num_papers,
task_name=task_name,
num_papers_per_trial=num_papers_per_trial,
max_trial=max_search_trial,
)
if save_dir == "":
return
logger.info(f"automated literature search finished")

run_grobid_path = os.path.join(
os.path.dirname(os.path.dirname(os.path.dirname(__file__))), f"modules/run_grobid.sh"
)

if not os.path.exists(run_grobid_path):
logger.error(f"Need to set up grobid first. Please run bash modules/setup_grobid.sh")
return

logger.info("running grobid")
grobid_process = subprocess.Popen(
["bash", run_grobid_path],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
time.sleep(10)
for i in range(30):
try:
response = requests.get("http://localhost:8070/api/isalive", timeout=2)
if response.status_code == 200:
logger.info("Grobid is ready!")
break
except requests.exceptions.RequestException:
pass
time.sleep(2)
else:
logger.error("Grobid did not become ready in time.")
grobid_process.terminate()
return

try:
raw_literature_dir = os.path.join(
os.path.dirname(os.path.dirname(os.path.dirname(__file__))), f"literature/{task_name}/raw"
)
processed_literature_dir = os.path.join(
os.path.dirname(os.path.dirname(os.path.dirname(__file__))), f"literature/{task_name}/processed"
)
os.makedirs(processed_literature_dir, exist_ok=True)

for root, dirs, files in os.walk(raw_literature_dir):
for file in files:
file_path = os.path.join(root, file)
try:
process_pdf_file(file_path, "./tmp_dir", processed_literature_dir)
except ConnectionError:
logger.warning("Grobid service is not running, please run ./run_grobid.sh first.")
shutil.rmtree("./tmp_dir")
logger.info("PDF preprocessing completed!")

except Exception as e:
logger.warning(f"Error when processing paper PDFs with grobid: {e}")
return

finally:
logger.info("stopping grobid process")
grobid_process.terminate()
grobid_process.wait()

self.summarize_papers(
data_file=processed_literature_dir,
cache_seed=cache_seed,
**generate_kwargs,
)
grobid_process.terminate()
Comment on lines +150 to +151
Copilot AI Jun 25, 2025

The grobid process is terminated in the finally block earlier (lines 142-144) and then terminated again on line 151; remove the redundant termination to simplify the flow.

Suggested change
-        )
-        grobid_process.terminate()
+        )
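
For context, a minimal sketch of the simplified tail of auto_process_literature that this suggestion implies (illustrative only, reusing names from the diff above; not part of the PR):

# Sketch: tail of auto_process_literature with the duplicate terminate() removed
try:
    ...  # convert the downloaded PDFs with process_pdf_file, as in the diff
except Exception as e:
    logger.warning(f"Error when processing paper PDFs with grobid: {e}")
    return
finally:
    # Grobid is stopped exactly once, whether processing succeeded or failed
    logger.info("stopping grobid process")
    grobid_process.terminate()
    grobid_process.wait()

# No trailing grobid_process.terminate() is needed here; the finally block already ran
self.summarize_papers(
    data_file=processed_literature_dir,
    cache_seed=cache_seed,
    **generate_kwargs,
)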

88 changes: 88 additions & 0 deletions hypothesis_agent/literature_review_agent/literature_search.py
@@ -0,0 +1,88 @@
import os
import requests
from tqdm import tqdm
from hypogenic.logger_config import LoggerConfig

def auto_literature_search(
topic: str,
num_papers: int,
task_name: str = None,
num_papers_per_trial: int = 10,
max_trial: int = 5,
):
Copilot AI Jun 25, 2025

The logger is used before it is defined (it is instantiated on line 20), which may result in a NameError. Consider moving the logger initialization to the beginning of the function.

Suggested change
-):
+):
+    logger = LoggerConfig.get_logger("auto-literature-search")
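
A minimal sketch of the corrected opening of the function (illustrative, not part of the diff); with the logger created up front, the second LoggerConfig.get_logger call after os.makedirs becomes redundant and can be dropped:

def auto_literature_search(
    topic: str,
    num_papers: int,
    task_name: str = None,
    num_papers_per_trial: int = 10,
    max_trial: int = 5,
):
    # Create the logger before any early return so logger.warning is defined on first use
    logger = LoggerConfig.get_logger("auto-literature-search")
    if task_name is None:
        logger.warning("need to specify task name")
        return ""
    save_dir = os.path.join(
        os.path.dirname(os.path.dirname(os.path.dirname(__file__))), f"literature/{task_name}/raw"
    )
    os.makedirs(save_dir, exist_ok=True)
    # ...rest of the function unchanged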

if task_name is None:
logger.warning(f"need to specify task name")
return ""
save_dir = os.path.join(
os.path.dirname(os.path.dirname(os.path.dirname(__file__))), f"literature/{task_name}/raw"
)
os.makedirs(save_dir, exist_ok=True)
logger = LoggerConfig.get_logger("auto-literature-search")

logger.info(f"searching for papers on topic {topic}")

params = {
"query": topic,
"limit": num_papers_per_trial,
"fields": "title,openAccessPdf",
"offset": 0,
}

API_URL = "https://api.semanticscholar.org/graph/v1/paper/search"

headers = {
"x-api-key": os.getenv("SS_API_KEY"),
}

papers_list = []
cnt_papers = 0
for n_trial in range(max_trial):
if cnt_papers >= num_papers:
break
params["offset"] = n_trial * num_papers_per_trial
response = requests.get(API_URL, params=params, headers=headers)
if response.status_code != 200:
logger.warning(f"Error when querying Semantic Scholar API. Message: {response.text}")
return ""

papers = response.json()["data"]
logger.info(f"found {len(papers)} papers for trial #{n_trial}, starting download")

for idx, paper in enumerate(tqdm(papers)):
if cnt_papers >= num_papers:
break
title = paper.get("title", f"paper_{idx}")
logger.info(f"found paper with title {title}")
pdf_info = paper.get("openAccessPdf")
if pdf_info and pdf_info.get("url"):
pdf_url = pdf_info["url"]
try:
pdf_response = requests.get(pdf_url, timeout=10, stream=True)
if pdf_response.status_code == 200:
content_type = pdf_response.headers.get('Content-Type', '').lower()
content_length = int(pdf_response.headers.get('Content-Length', '0'))

if 'pdf' in content_type and content_length > 10 * 1024: # at least 10 KB
safe_title = "".join(c if c.isalnum() or c in " ._-" else "_" for c in title)
filename = safe_title.strip().replace(" ", "_")[:80] + ".pdf"
filepath = os.path.join(save_dir, filename)
papers_list.append(filename)

with open(filepath, "wb") as f:
for chunk in pdf_response.iter_content(chunk_size=8192):
f.write(chunk)
cnt_papers += 1
logger.info(f"Successfully downloaded PDF for paper {title}")
else:
logger.info(f"Skipped (not a valid or too-small PDF): {pdf_url} (size: {content_length} bytes)")
else:
logger.info(f"Failed to download {title} (HTTP {pdf_response.status_code})")
except Exception as e:
logger.info(f"Error downloading {title}: {e}")
else:
logger.info(f"No open access PDF available for '{title}'.")

logger.info(f"Completed automated literature search, downloaded {cnt_papers} papers:")
for paper in papers_list:
logger.info(paper)
return save_dir
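
For orientation, a hypothetical call to the new search helper (the topic and task name below are invented examples; the function reads the Semantic Scholar API key from the SS_API_KEY environment variable):

from hypothesis_agent.literature_review_agent.literature_search import auto_literature_search

# Downloads up to num_papers open-access PDFs into literature/<task_name>/raw
save_dir = auto_literature_search(
    topic="in-context learning",  # assumed example topic
    num_papers=10,
    task_name="demo_task",        # assumed example task name
)
print(save_dir)  # "" on failure, otherwise the raw-PDF directory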