Skip to content

Commit bf7c57c

Browse files
committed
refactored parseUtils
1 parent 4c09c33 commit bf7c57c

File tree

2 files changed

+176
-127
lines changed

2 files changed

+176
-127
lines changed

frontend/degree-plan/pages/OnboardingPage.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ const OnboardingPage = ({
4646
// TRANSCRIPT PARSING
4747
const total = useRef<any>({});
4848
const addText = (items: any[], index: number) => {
49-
const allText: any = parseItems(items, index);
49+
const allText: any = parseItems(items);
5050
let textResult = [];
5151
for (let col in allText) {
5252
let poses = Object.keys(allText[col]).reverse();

frontend/degree-plan/utils/parseUtils.ts

Lines changed: 175 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,25 @@ import { createMajorLabel } from "@/components/FourYearPlan/DegreeModal";
22
import { DegreeListing, SchoolOption } from "@/types";
33
const { closest } = require("fastest-levenshtein");
44

5-
export const parseItems = (items: any[], index: number) => {
5+
// Shape of one text item handed to parseItems.
// NOTE(review): field names look like a pdf.js text-content item — confirm against the PDF loader.
type LineItem = {
6+
dir: string;
7+
fontName: string;
8+
hasEOL: boolean;
9+
height: number;
10+
// The item's text; parseItems compares it against "Level:High School" and stores it in the column output.
str: string;
11+
// parseItems reads transform[4] as the column (x) value and transform[5] as the line position key.
transform: number[];
12+
width: number;
13+
};
14+
15+
// A selectable degree entry: the degree listing itself plus the text label paired with it.
// NOTE(review): presumably the option shape consumed by a select/dropdown component — confirm at the call sites.
type DegreeOption = {
16+
value: DegreeListing;
17+
label: string;
18+
};
19+
20+
// Given a list of line items from the PDF, return an object of columns and the lines in each column.
21+
export const parseItems = (items: LineItem[]) => {
622
// At most the transcript will have two columns - we account for that here.
7-
let allText: any = { col0: [], col1: [] };
23+
let allText: { col0: string[][]; col1: string[][] } = { col0: [], col1: [] };
824

925
// Find x value for when second column begins using convenient lines.
1026
let maxCol = items.reduce(function (acc, el) {
@@ -21,7 +37,7 @@ export const parseItems = (items: any[], index: number) => {
2137
let col = items[i]?.transform[4];
2238
let pos = items[i]?.transform[5];
2339

24-
let currentCol = col < maxCol ? "col0" : "col1";
40+
let currentCol: "col0" | "col1" = col < maxCol ? "col0" : "col1";
2541

2642
// Ignore potential high school program transcript
2743
if (items[i].str === "Level:High School") {
@@ -33,14 +49,17 @@ export const parseItems = (items: any[], index: number) => {
3349
else allText[currentCol][pos] = [items[i]?.str];
3450
}
3551

52+
console.log(allText);
3653
return allText;
3754
};
3855

56+
// Given a list of degrees, a list of schools, and a starting year,
57+
// return a list of relevant possible majors.
3958
export const getMajorOptions = (
4059
degrees: DegreeListing[] | undefined,
4160
schools: SchoolOption[],
42-
startingYear: any
43-
) => {
61+
startingYear: number
62+
): DegreeOption[] | undefined => {
4463
const majorOptions = degrees
4564
?.filter((d) => schools.map((s) => s.value).includes(d.degree))
4665
.sort((d) => Math.abs((startingYear ? startingYear : d.year) - d.year))
@@ -52,138 +71,91 @@ export const getMajorOptions = (
5271
return majorOptions;
5372
};
5473

55-
export const parseTranscript = (
56-
textResult: any,
57-
degrees: DegreeListing[] | undefined
58-
) => {
59-
let separatedCourses: any = [];
60-
let startYear: number = 0;
61-
let tempSchools: any = [];
62-
let detectedMajors: string[] = [];
63-
let detectedConcentrations: string[] = [];
64-
65-
for (let l in textResult) {
66-
// SCRAPE SCHOOL
67-
if (textResult[l].replaceAll(" ", "").includes("program:")) {
68-
let program = textResult[l].replace(/^.*?:\s*/, "");
69-
if (program.includes("arts"))
70-
tempSchools.push({ value: "BA", label: "Arts & Sciences" });
71-
72-
if (program.includes("school of engineering and applied science")) {
73-
if (
74-
textResult[parseInt(l) + 1].includes(
75-
"bachelor of science in engineering"
76-
)
77-
)
78-
tempSchools.push({ value: "BSE", label: "Engineering BSE" });
79-
else tempSchools.push({ value: "BAS", label: "Engineering BAS" });
80-
}
81-
if (program.includes("wharton"))
82-
tempSchools.push({ value: "BS", label: "Wharton" });
83-
if (program.includes("nursing"))
84-
tempSchools.push({ value: "BSN", label: "Nursing" });
85-
}
74+
/**
 * Scrapes the school(s) named on a transcript "program:" line.
 *
 * Keywords are matched in lowercase, so the lines are presumably lowercased
 * before they reach this function — confirm against the caller.
 *
 * @param textResult - Transcript lines.
 * @param l - Index of the "program:" line within `textResult`.
 * @returns One `{ value, label }` entry per recognised school, in the order
 *   arts, engineering, wharton, nursing.
 */
const checkSchool = (
  textResult: string[],
  l: number
): { value: string; label: string }[] => {
  const tempSchools: { value: string; label: string }[] = [];

  // Drop everything up to and including the first colon (the "program:" prefix).
  const program = textResult[l].replace(/^.*?:\s*/, "");

  if (program.includes("arts")) {
    tempSchools.push({ value: "BA", label: "Arts & Sciences" });
  }

  if (program.includes("school of engineering and applied science")) {
    // The degree type is read from the following line. That line may not exist
    // when the program line is the last one, so guard the lookup and fall back to BAS.
    const isBSE =
      textResult[l + 1]?.includes("bachelor of science in engineering") ?? false;
    if (isBSE) {
      tempSchools.push({ value: "BSE", label: "Engineering BSE" });
    } else {
      tempSchools.push({ value: "BAS", label: "Engineering BAS" });
    }
  }

  if (program.includes("wharton")) {
    tempSchools.push({ value: "BS", label: "Wharton" });
  }

  if (program.includes("nursing")) {
    tempSchools.push({ value: "BSN", label: "Nursing" });
  }

  return tempSchools;
};
9192

92-
// SCRAPE CONCENTRATION
93-
if (textResult[l].includes("concentration")) {
94-
detectedConcentrations.push(textResult[l].replace(/^.*?:\s*/, ""));
93+
/**
 * Collects AP / transfer courses listed after a "transfer credit" line.
 *
 * Lines after index `l` are scanned in order. Each line containing a course
 * code (a word followed by a 3-4 digit number) that is not a term header is
 * recorded under the "_TRAN" semester. Scanning stops at the first line that
 * is not recorded as a course and contains "institution credit".
 *
 * @param textResult - Transcript lines.
 * @param l - Index of the "transfer credit" line within `textResult`.
 * @returns Map of course code to "_TRAN".
 */
const getAPAndTransferCourses = (
  textResult: string[],
  l: number
): { [key: string]: string } => {
  const courses: { [key: string]: string } = {};
  const truncatedTranscript = textResult.slice(l + 1);

  for (const line of truncatedTranscript) {
    // Match lines following course code format
    const courseMatch = line.match(/\b\w+\s\d{3,4}\b/);
    if (
      courseMatch &&
      // "[term] [year]" (e.g. "fall 2020") also fits the course-code shape; test
      // the matched text itself rather than relying on array-to-string coercion.
      !/(fall|spring|summer)\s\d{4}/i.test(courseMatch[0])
    ) {
      courses[courseMatch[0]] = "_TRAN";
    } else if (line.includes("institution credit")) {
      break;
    }
  }

  return courses;
};
96114

97-
// SCRAPE AP AND TRANSFER CREDIT
98-
if (textResult[l].includes("transfer credit")) {
99-
let truncatedTranscript = textResult.slice(parseInt(l) + 1);
100-
let courses = [];
101-
for (let line of truncatedTranscript) {
102-
// Match lines following course code format
103-
let courseMatch = line.match(/\b\w+\s\d{3,4}\b/);
104-
if (
105-
courseMatch &&
106-
// Match lines following [term] [year] format
107-
!/(fall|spring|summer)\s\d{4}/i.test(courseMatch)
108-
) {
109-
courses.push(courseMatch[0]);
110-
} else if (line.includes("institution credit")) {
111-
separatedCourses["_TRAN"] = courses;
112-
break;
113-
}
115+
/**
 * Maps each non-transfer course to the semester it should be filed under.
 *
 * Lines are scanned in order. A "[fall|spring|summer] [year]" line becomes the
 * current semester. Any other line containing a course code (two or more
 * letters followed by a 3-4 digit number) is recorded, unless the line ends in
 * an "f" or "w" grade.
 *
 * Courses seen before the first non-summer semester are treated as pre-college
 * and filed under "_TRAN", including when the same course appears more than
 * once in that stretch.
 *
 * TODO: We don't yet have a way to track courses that can be taken multiple
 * times, so a course that appears multiple times is kept only in the most
 * recent semester it appears in.
 *
 * @param truncatedTranscript - Transcript lines following the "institution credit" line.
 * @returns Map of course code to the semester line as it appears in the
 *   transcript (untrimmed), or "_TRAN".
 */
const getCourseToSem = (
  truncatedTranscript: string[]
): { [key: string]: string } => {
  let firstNonSummerSemReached = false;
  let currentSem = "";
  const courseToSem: { [key: string]: string } = {};

  for (const line of truncatedTranscript) {
    if (/(fall|spring|summer)\s\d{4}/i.test(line)) {
      currentSem = line;
      // Case-insensitive so it agrees with the term pattern above.
      if (!/summer/i.test(currentSem)) {
        firstNonSummerSemReached = true;
      }
      continue;
    }

    const courseMatch = line.match(/\b[A-Za-z]{2,}\s\d{3,4}\b/);
    if (!courseMatch) continue;

    // Skip failed / withdrawn courses. Trailing whitespace is ignored so it
    // cannot hide the grade character.
    const grade = line.trimEnd().slice(-1);
    if (grade === "f" || grade === "w") continue;

    // A later occurrence overwrites an earlier one. Pre-college courses always
    // land in "_TRAN" rather than an empty or pre-college semester.
    courseToSem[courseMatch[0]] = firstNonSummerSemReached
      ? currentSem
      : "_TRAN";
  }

  return courseToSem;
};
182150

183-
// Match majors
184-
let detectedMajorsOptions = [];
185-
186-
let possibleDegrees = getMajorOptions(degrees, tempSchools, startYear);
151+
// Given a list of detected majors, a list of detected concentrations, and a list of possible degrees,
152+
// return a list of detected majors options.
153+
const detectMajors = (
154+
detectedMajors: string[],
155+
detectedConcentrations: string[],
156+
possibleDegrees: DegreeOption[] | undefined
157+
) => {
158+
const detectedMajorsOptions = [];
187159
for (let i in detectedMajors) {
188160
let m =
189161
detectedMajors[i] +
@@ -206,8 +178,85 @@ export const parseTranscript = (
206178
if (majorOption) detectedMajorsOptions.push(majorOption);
207179
}
208180
}
181+
182+
return detectedMajorsOptions;
183+
};
184+
185+
// Given a list of lines from the PDF and a list of possible degrees,
186+
// return the scraped information (courses by semester, start year, schools, and detected major options).
187+
export const parseTranscript = (
188+
textResult: string[],
189+
degrees: DegreeListing[] | undefined
190+
) => {
191+
let courseToSem: { [key: string]: string } = {};
192+
let startYear: number = 0;
193+
let tempSchools: { value: string; label: string }[] = [];
194+
let detectedMajors: string[] = [];
195+
let detectedConcentrations: string[] = [];
196+
197+
for (let l = 0; l < textResult.length; l++) {
198+
if (textResult[l].replaceAll(" ", "").includes("program:")) {
199+
tempSchools = tempSchools.concat(checkSchool(textResult, l));
200+
}
201+
202+
if (textResult[l].includes("major")) {
203+
detectedMajors.push(textResult[l].replace(/^.*?:\s*/, ""));
204+
}
205+
206+
if (textResult[l].includes("concentration")) {
207+
detectedConcentrations.push(textResult[l].replace(/^.*?:\s*/, ""));
208+
}
209+
210+
if (textResult[l].includes("transfer credit")) {
211+
courseToSem = {
212+
...courseToSem,
213+
...getAPAndTransferCourses(textResult, l),
214+
};
215+
}
216+
217+
if (textResult[l].includes("institution credit")) {
218+
courseToSem = {
219+
...courseToSem,
220+
...getCourseToSem(textResult.slice(l + 1)),
221+
};
222+
}
223+
}
224+
225+
const separatedCourses = Object.entries(courseToSem).reduce(
226+
(acc, [course, sem]) => {
227+
const trimmedSem = sem.trim();
228+
if (!acc[trimmedSem]) acc[trimmedSem] = [];
229+
acc[trimmedSem].push(course);
230+
return acc;
231+
},
232+
{} as { [key: string]: string[] }
233+
);
234+
235+
const formattedSeparatedCourses = Object.entries(separatedCourses).map(
236+
([sem, courses]) => ({
237+
sem,
238+
courses,
239+
})
240+
);
241+
242+
// Scrape start year and infer grad year
243+
let years = formattedSeparatedCourses.map(
244+
(e: { sem: string; courses: string[] }, i: number) => {
245+
return parseInt(e.sem.replace(/\D/g, ""));
246+
}
247+
);
248+
years.shift();
249+
startYear = Math.min(...years);
250+
251+
let possibleDegrees = getMajorOptions(degrees, tempSchools, startYear);
252+
let detectedMajorsOptions = detectMajors(
253+
detectedMajors,
254+
detectedConcentrations,
255+
possibleDegrees
256+
);
257+
209258
return {
210-
scrapedCourses: separatedCourses,
259+
scrapedCourses: formattedSeparatedCourses,
211260
startYear: startYear,
212261
scrapedSchools: tempSchools,
213262
detectedMajorsOptions: detectedMajorsOptions,

0 commit comments

Comments
 (0)