Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 119 additions & 0 deletions apps/spider/crawlers/offerings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import re
from urllib.parse import urljoin

from apps.spider.utils import retrieve_soup
from apps.web.models import Course, CourseOffering, Instructor

# Root of the site being crawled; the offerings path below is resolved against it.
BASE_URL = "https://gc.sjtu.edu.cn/"
# Default page fetched by crawl_offerings().
OFFERINGS_URL = urljoin(BASE_URL, "/academics/courses/present-course-offerings/")

# Lower-case season name -> one-letter suffix that parse_offering_term()
# appends to the two-digit year when building a term code.
TERM_CODES = {
    "spring": "S",
    "summer": "X",
    "fall": "F",
    "winter": "W",
}


def parse_offering_term(text):
    """Build a short term code from text containing "<season> <year>".

    The season (Spring/Summer/Fall/Winter, any letter case) must be followed
    by whitespace and a four-digit year. The result is the last two digits of
    the year followed by the season letter from TERM_CODES.

    Returns None when text contains no such season/year pair.
    """
    found = re.search(r"(Spring|Summer|Fall|Winter)\s+(\d{4})", text, re.I)
    if found is None:
        return None

    season_key = found.group(1).lower()
    short_year = found.group(2)[-2:]
    return short_year + TERM_CODES[season_key]


def normalize_course_code(raw_course_code):
    """Extract the canonical course code from the start of a raw string.

    After stripping surrounding whitespace, the string must begin with 2-4
    upper-case letters (the department) followed by 3-4 digits and an
    optional trailing "J". Anything after that prefix is ignored.

    Returns a (course_code, department, number) tuple where course_code keeps
    the optional "J" and number is the digits as an int, or
    (None, None, None) when the prefix does not match.
    """
    parsed = re.match(
        r"^(?P<department>[A-Z]{2,4})(?P<number>\d{3,4}J?)",
        raw_course_code.strip(),
    )
    if parsed is None:
        return None, None, None

    department, number_with_suffix = parsed.group("department", "number")
    digits_only = number_with_suffix.removesuffix("J")
    return department + number_with_suffix, department, int(digits_only)


def parse_instructors(text):
    """Split a free-text instructor cell into a list of individual names.

    Names may be separated by a comma or semicolon (ASCII or full-width), by
    the ideographic comma, or by the standalone word "and". Surrounding
    whitespace is removed from each name and empty pieces are dropped.

    Returns a list of names in their original order; an empty or missing
    text yields an empty list.
    """
    if not text:
        return []

    # \uFF1B / \uFF0C are the full-width semicolon and comma and \u3001 is the
    # ideographic comma. They are written as escapes so the separators survive
    # editors and tools that normalise punctuation to ASCII.
    pieces = re.split(r"[;,\uFF1B\uFF0C\u3001]|\band\b", text)
    return [piece.strip() for piece in pieces if piece.strip()]


def crawl_offerings(url=OFFERINGS_URL):
    """Scrape the course-offerings page into a flat list of record dicts.

    Every <h1> whose text contains "Courses Offered in" is treated as a term
    heading, and the first <table> found after it supplies the rows. The
    first <tr> of each table is skipped. A row with five or more <td> cells
    starts a new record (code, Chinese title, English title, credits,
    instructors); a later row with exactly one cell adds more instructors to
    the most recent record. A multi-cell row whose first cell is not a
    recognisable course code is dropped and also prevents following
    single-cell rows from attaching to an earlier record.

    Returns a list of dicts with the keys term, course_code, department,
    number, course_title_zh, course_title, course_credits and instructors.
    """
    page = retrieve_soup(url)
    records = []

    for heading in page.find_all("h1"):
        heading_text = heading.get_text(" ", strip=True)
        if "Courses Offered in" not in heading_text:
            continue

        term = parse_offering_term(heading_text)
        table = heading.find_next("table")
        if table is None or not term:
            continue

        latest = None
        for row in table.find_all("tr")[1:]:
            cells = [td.get_text(" ", strip=True) for td in row.find_all("td")]
            cell_count = len(cells)

            if cell_count == 1:
                # Continuation row: extra instructors for the record above.
                if latest is not None:
                    latest["instructors"].extend(parse_instructors(cells[0]))
                continue
            if cell_count < 5:
                continue

            course_code, department, number = normalize_course_code(cells[0])
            if not course_code:
                latest = None
                continue

            credit_text = cells[3]
            if credit_text.isdigit():
                credit_value = int(credit_text)
            else:
                # Non-numeric credits cell falls back to 0.
                credit_value = 0

            latest = {
                "term": term,
                "course_code": course_code,
                "department": department,
                "number": number,
                "course_title_zh": cells[1],
                "course_title": cells[2],
                "course_credits": credit_value,
                "instructors": parse_instructors(cells[4]),
            }
            records.append(latest)

    return records


def import_offerings(offering_data):
    """Persist crawled offering records as Course / CourseOffering rows.

    For each non-empty record the matching Course is fetched or created by
    course_code, a CourseOffering for (course, term, section=1) is fetched or
    created, and the offering's instructors are replaced with the crawled
    names (creating Instructor rows as needed).

    Args:
        offering_data: iterable of dicts with the keys course_code,
            course_title, department, number, course_credits and term, plus
            an optional instructors list of names. Falsy entries are skipped.
    """
    for offering in offering_data:
        if not offering:
            continue

        crawled_credits = offering["course_credits"]
        course, created = Course.objects.get_or_create(
            course_code=offering["course_code"],
            defaults={
                "course_title": offering["course_title"][:100],
                "department": offering["department"],
                "number": offering["number"],
                "course_credits": crawled_credits,
            },
        )
        # The crawler stores 0 when the credits cell is not numeric, so a
        # falsy value means "unknown" and must not overwrite credits that are
        # already stored for an existing course.
        if not created and crawled_credits:
            course.course_credits = crawled_credits
            course.save(update_fields=["course_credits", "updated_at"])

        course_offering, _ = CourseOffering.objects.get_or_create(
            course=course,
            term=offering["term"],
            section=1,
            defaults={"period": "", "limit": None},
        )

        # Continuation rows can repeat a name; de-duplicate (keeping order)
        # so each instructor is looked up only once.
        unique_names = dict.fromkeys(offering.get("instructors", []))
        instructors = [
            Instructor.objects.get_or_create(name=name)[0] for name in unique_names
        ]
        course_offering.instructors.set(instructors)
24 changes: 18 additions & 6 deletions apps/spider/crawlers/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
from apps.web.models import Course, CourseOffering, Instructor
from lib.constants import CURRENT_TERM

BASE_URL = "https://www.ji.sjtu.edu.cn/"
BASE_URL = "https://gc.sjtu.edu.cn/"
ORC_BASE_URL = urljoin(BASE_URL, "/academics/courses/courses-by-number/")
# ORC_UNDERGRAD_SUFFIX = "Departments-Programs-Undergraduate"
# ORC_GRADUATE_SUFFIX = "Departments-Programs-Graduate"
COURSE_DETAIL_URL_PREFIX = (
"https://www.ji.sjtu.edu.cn/academics/courses/courses-by-number/course-info/?id="
"https://gc.sjtu.edu.cn/academics/courses/courses-by-number/course-info/?id="
)
UNDERGRAD_URL = ORC_BASE_URL
INSTRUCTOR_TERM_REGEX = re.compile(r"^(?P<name>\w*)\s?(\((?P<term>\w*)\))?")
Expand Down Expand Up @@ -68,9 +68,17 @@ def _crawl_course_data(course_url):
split_course_heading = course_heading.split(" – ")
children = list(soup.find_all(class_="et_pb_text_inner")[3].children)

course_code = split_course_heading[0]
department = re.findall(r"^([A-Z]{2,4})\d+", course_code)[0]
number = re.findall(r"^[A-Z]{2,4}(\d{3})", course_code)[0]
raw_course_code = split_course_heading[0].strip()
course_code_match = re.match(
r"^(?P<department>[A-Z]{2,4})(?P<number>\d{3,4}J?)", raw_course_code
)
if not course_code_match:
return None

department = course_code_match.group("department")
number_text = course_code_match.group("number").removesuffix("J")
number = int(number_text)
course_code = f"{department}{course_code_match.group('number')}"
course_title = split_course_heading[1]

course_credits = 0
Expand All @@ -82,7 +90,8 @@ def _crawl_course_data(course_url):
for i, child in enumerate(children):
text = child.get_text(strip=True) if hasattr(child, "get_text") else ""
if "Credits:" in text:
course_credits = int(re.findall(r"\d+", text)[0])
credits_match = re.search(r"Credits:\s*(\d+)", text)
course_credits = int(credits_match.group(1)) if credits_match else 0
elif "Pre-requisites:" in text:
pre_requisites = extract_prerequisites(text)
elif "Description:" in text:
Expand Down Expand Up @@ -138,6 +147,9 @@ def _crawl_course_data(course_url):

def import_department(department_data):
for course_data in department_data:
if not course_data:
continue

course, created = Course.objects.update_or_create(
course_code=course_data["course_code"],
defaults={
Expand Down
26 changes: 26 additions & 0 deletions apps/spider/migrations/0003_add_course_offerings_data_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Generated by Codex on 2026-05-13

from django.db import migrations, models


class Migration(migrations.Migration):
    """Redefine CrawledData.data_type with "course_offerings" among its choices."""

    # Applied after the previous migration of the spider app.
    dependencies = [
        ("spider", "0002_alter_crawleddata_current_data_and_more"),
    ]

    operations = [
        migrations.AlterField(
            model_name="crawleddata",
            name="data_type",
            field=models.CharField(
                # (stored value, display label) pairs accepted by the field.
                choices=[
                    ("medians", "Medians"),
                    ("orc_department_courses", "ORC Department Courses"),
                    ("course_timetable", "Course Timetable"),
                    ("course_offerings", "Course Offerings"),
                ],
                default="",
                max_length=32,
            ),
        ),
    ]
7 changes: 6 additions & 1 deletion apps/spider/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,12 @@ class CrawledData(models.Model):
MEDIANS = "medians"
ORC_DEPARTMENT_COURSES = "orc_department_courses"
COURSE_TIMETABLE = "course_timetable"
COURSE_OFFERINGS = "course_offerings"
DATA_TYPE_CHOICES = (
(MEDIANS, "Medians"),
(ORC_DEPARTMENT_COURSES, "ORC Department Courses"),
(COURSE_TIMETABLE, "Course Timetable"),
(COURSE_OFFERINGS, "Course Offerings"),
)
objects = CrawledDataManager()

Expand Down Expand Up @@ -102,4 +104,7 @@ def email_change(self):
def approve_change(self):
from apps.spider.tasks import import_pending_crawled_data

import_pending_crawled_data.delay(self.pk)
if settings.DEBUG or import_pending_crawled_data.app.conf.task_always_eager:
import_pending_crawled_data(self.pk)
else:
import_pending_crawled_data.delay(self.pk)
31 changes: 27 additions & 4 deletions apps/spider/tasks.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# from apps.spider.crawlers import medians, orc, timetable
# from apps.spider.crawlers import medians, timetable
from celery import shared_task
from django.db import transaction

from apps.spider.crawlers import orc
from apps.spider.crawlers import offerings, orc
from apps.spider.models import CrawledData
from lib import task_utils

Expand All @@ -20,6 +20,8 @@ def import_pending_crawled_data(crawled_data_pk):
# elif
if crawled_data.data_type == CrawledData.ORC_DEPARTMENT_COURSES:
orc.import_department(crawled_data.pending_data)
elif crawled_data.data_type == CrawledData.COURSE_OFFERINGS:
offerings.import_offerings(crawled_data.pending_data)
# else:
# assert crawled_data.data_type == CrawledData.COURSE_TIMETABLE
# timetable.import_timetable(crawled_data.pending_data)
Expand Down Expand Up @@ -55,11 +57,32 @@ def crawl_orc():
program_urls = orc.crawl_program_urls()
print(f"Found {len(program_urls)} program URLs")
# assert len(program_urls) > 50
for url in program_urls:
crawl_program_url.delay(url)
new_data = [
course_data
for course_data in (orc._crawl_course_data(url) for url in sorted(program_urls))
if course_data
]
CrawledData.objects.handle_new_crawled_data(
new_data,
"orc_department_courses",
CrawledData.ORC_DEPARTMENT_COURSES,
)
return sorted(program_urls)


@shared_task
@task_utils.email_if_fails
def crawl_offerings():
    """Crawl the course-offerings page and hand the result to CrawledData.

    The crawled records are passed to handle_new_crawled_data under the
    resource name "present_course_offerings" with the COURSE_OFFERINGS data
    type, and are also returned as the task result.
    """
    print("Starting crawl_offerings")
    crawled_records = offerings.crawl_offerings()
    CrawledData.objects.handle_new_crawled_data(
        crawled_records, "present_course_offerings", CrawledData.COURSE_OFFERINGS
    )
    return crawled_records


@shared_task
@task_utils.email_if_fails
def crawl_program_url(url, program_code=None):
Expand Down
Loading
Loading