@@ -2,9 +2,25 @@ import { createMajorLabel } from "@/components/FourYearPlan/DegreeModal";
22import { DegreeListing , SchoolOption } from "@/types" ;
33const { closest } = require ( "fastest-levenshtein" ) ;
44
5- export const parseItems = ( items : any [ ] , index : number ) => {
/**
 * One text item taken from a PDF page, as consumed by `parseItems`.
 *
 * NOTE(review): the field set looks like pdf.js's `TextItem`; confirm against
 * the PDF library actually used by the caller.
 */
type LineItem = {
  /** Text direction of the item (presumably values such as "ltr"; verify). */
  dir: string;
  /** Name of the font the item was rendered with. */
  fontName: string;
  /** Whether the item is flagged as ending a line. */
  hasEOL: boolean;
  /** Height of the item. */
  height: number;
  /** The text content of the item. */
  str: string;
  /**
   * Transform values for the item. `parseItems` reads index 4 as the
   * horizontal (column) position and index 5 as the line position.
   */
  transform: number[];
  /** Width of the item. */
  width: number;
};
14+
/**
 * A degree choice as returned by `getMajorOptions`: the underlying degree
 * listing paired with a text label for it.
 */
type DegreeOption = {
  /** The degree listing this option stands for. */
  value: DegreeListing;
  /** Text label for the option. */
  label: string;
};
19+
20+ // Given a list of line items from the PDF, return an object of columns and the lines in each column.
21+ export const parseItems = ( items : LineItem [ ] ) => {
622 // At most the transcript will have two columns - we account for that here.
7- let allText : any = { col0 : [ ] , col1 : [ ] } ;
23+ let allText : { col0 : string [ ] [ ] ; col1 : string [ ] [ ] } = { col0 : [ ] , col1 : [ ] } ;
824
925 // Find x value for when second column begins using convenient lines.
1026 let maxCol = items . reduce ( function ( acc , el ) {
@@ -21,7 +37,7 @@ export const parseItems = (items: any[], index: number) => {
2137 let col = items [ i ] ?. transform [ 4 ] ;
2238 let pos = items [ i ] ?. transform [ 5 ] ;
2339
24- let currentCol = col < maxCol ? "col0" : "col1" ;
40+ let currentCol : "col0" | "col1" = col < maxCol ? "col0" : "col1" ;
2541
2642 // Ignore potential high school program transcript
2743 if ( items [ i ] . str === "Level:High School" ) {
@@ -33,14 +49,17 @@ export const parseItems = (items: any[], index: number) => {
3349 else allText [ currentCol ] [ pos ] = [ items [ i ] ?. str ] ;
3450 }
3551
52+ console . log ( allText ) ;
3653 return allText ;
3754} ;
3855
56+ // Given a list of degrees, a list of schools, and a starting year,
57+ // return a list of relevant possible majors.
3958export const getMajorOptions = (
4059 degrees : DegreeListing [ ] | undefined ,
4160 schools : SchoolOption [ ] ,
42- startingYear : any
43- ) => {
61+ startingYear : number
62+ ) : DegreeOption [ ] | undefined => {
4463 const majorOptions = degrees
4564 ?. filter ( ( d ) => schools . map ( ( s ) => s . value ) . includes ( d . degree ) )
4665 . sort ( ( d ) => Math . abs ( ( startingYear ? startingYear : d . year ) - d . year ) )
@@ -52,138 +71,91 @@ export const getMajorOptions = (
5271 return majorOptions ;
5372} ;
5473
55- export const parseTranscript = (
56- textResult : any ,
57- degrees : DegreeListing [ ] | undefined
58- ) => {
59- let separatedCourses : any = [ ] ;
60- let startYear : number = 0 ;
61- let tempSchools : any = [ ] ;
62- let detectedMajors : string [ ] = [ ] ;
63- let detectedConcentrations : string [ ] = [ ] ;
64-
65- for ( let l in textResult ) {
66- // SCRAPE SCHOOL
67- if ( textResult [ l ] . replaceAll ( " " , "" ) . includes ( "program:" ) ) {
68- let program = textResult [ l ] . replace ( / ^ .* ?: \s * / , "" ) ;
69- if ( program . includes ( "arts" ) )
70- tempSchools . push ( { value : "BA" , label : "Arts & Sciences" } ) ;
71-
72- if ( program . includes ( "school of engineering and applied science" ) ) {
73- if (
74- textResult [ parseInt ( l ) + 1 ] . includes (
75- "bachelor of science in engineering"
76- )
77- )
78- tempSchools . push ( { value : "BSE" , label : "Engineering BSE" } ) ;
79- else tempSchools . push ( { value : "BAS" , label : "Engineering BAS" } ) ;
80- }
81- if ( program . includes ( "wharton" ) )
82- tempSchools . push ( { value : "BS" , label : "Wharton" } ) ;
83- if ( program . includes ( "nursing" ) )
84- tempSchools . push ( { value : "BSN" , label : "Nursing" } ) ;
85- }
/**
 * Scrapes the school(s) named on a transcript "program:" line.
 *
 * @param textResult - Transcript lines. The checks below compare against
 *   lowercase text, so lines are presumably lowercased by the caller (verify).
 * @param l - Index of the line that holds the program; the caller guarantees
 *   it is a program line.
 * @returns One `{ value, label }` entry per school recognised on the line, in
 *   the order arts, engineering, wharton, nursing. Empty when none match.
 */
const checkSchool = (
  textResult: string[],
  l: number
): { value: string; label: string }[] => {
  const schools: { value: string; label: string }[] = [];

  // Keep only the text after the first colon ("program: xyz" -> "xyz").
  const program = textResult[l].replace(/^.*?:\s*/, "");

  if (program.includes("arts")) {
    schools.push({ value: "BA", label: "Arts & Sciences" });
  }

  if (program.includes("school of engineering and applied science")) {
    // The degree type is read from the following line. That line may not exist
    // when the program line is the last one, so guard the lookup instead of
    // throwing; a missing line falls back to BAS.
    const isBSE =
      textResult[l + 1]?.includes("bachelor of science in engineering") ??
      false;
    schools.push(
      isBSE
        ? { value: "BSE", label: "Engineering BSE" }
        : { value: "BAS", label: "Engineering BAS" }
    );
  }

  if (program.includes("wharton")) {
    schools.push({ value: "BS", label: "Wharton" });
  }

  if (program.includes("nursing")) {
    schools.push({ value: "BSN", label: "Nursing" });
  }

  return schools;
};
9192
92- // SCRAPE CONCENTRATION
93- if ( textResult [ l ] . includes ( "concentration" ) ) {
94- detectedConcentrations . push ( textResult [ l ] . replace ( / ^ .* ?: \s * / , "" ) ) ;
/**
 * Scrapes AP and transfer courses that follow a "transfer credit" line.
 *
 * Scanning starts on the line after index `l` and stops at the first line
 * that contains "institution credit" and is not itself recorded as a course.
 *
 * @param textResult - Transcript lines.
 * @param l - Index of the "transfer credit" line.
 * @returns A map from course code (e.g. "cis 1100") to the "_TRAN" semester.
 */
const getAPAndTransferCourses = (
  textResult: string[],
  l: number
): Record<string, string> => {
  // "<word> <3-4 digits>", e.g. "math 104".
  const coursePattern = /\b\w+\s\d{3,4}\b/;
  // "[term] [year]" headers also fit the course pattern and must be excluded.
  const termPattern = /(fall|spring|summer)\s\d{4}/i;

  const courses: Record<string, string> = {};
  for (const line of textResult.slice(l + 1)) {
    const courseMatch = line.match(coursePattern);
    if (courseMatch && !termPattern.test(courseMatch[0])) {
      courses[courseMatch[0]] = "_TRAN";
    } else if (line.includes("institution credit")) {
      break;
    }
  }
  return courses;
};
96114
97- // SCRAPE AP AND TRANSFER CREDIT
98- if ( textResult [ l ] . includes ( "transfer credit" ) ) {
99- let truncatedTranscript = textResult . slice ( parseInt ( l ) + 1 ) ;
100- let courses = [ ] ;
101- for ( let line of truncatedTranscript ) {
102- // Match lines following course code format
103- let courseMatch = line . match ( / \b \w + \s \d { 3 , 4 } \b / ) ;
104- if (
105- courseMatch &&
106- // Match lines following [term] [year] format
107- ! / ( f a l l | s p r i n g | s u m m e r ) \s \d { 4 } / i. test ( courseMatch )
108- ) {
109- courses . push ( courseMatch [ 0 ] ) ;
110- } else if ( line . includes ( "institution credit" ) ) {
111- separatedCourses [ "_TRAN" ] = courses ;
112- break ;
113- }
/**
 * Maps each non-transfer course on the transcript to the semester it belongs to.
 *
 * A line matching "[term] [year]" starts a new semester. Any other line with a
 * course code ("<2+ letters> <3-4 digits>") is recorded unless it ends in "f"
 * or "w" (failed or withdrawn).
 *
 * Courses seen before the first non-summer semester are treated as pre-college
 * and stored under "_TRAN". A course that appears more than once is kept only
 * under the most recent semester it appears in.
 *
 * @param truncatedTranscript - The lines after the "institution credit" line.
 *   The "summer" / "f" / "w" checks are case-sensitive, so lines are presumably
 *   lowercased by the caller (verify).
 * @returns A map from course code to semester line (or "_TRAN").
 */
const getCourseToSem = (
  truncatedTranscript: string[]
): Record<string, string> => {
  const termPattern = /(fall|spring|summer)\s\d{4}/i;
  const coursePattern = /\b[A-Za-z]{2,}\s\d{3,4}\b/;

  const courseToSem: Record<string, string> = {};
  let firstNonSummerSemReached = false;
  let currentSem = "";

  for (const line of truncatedTranscript) {
    if (termPattern.test(line)) {
      currentSem = line;
      // Semesters only start counting once a fall/spring term is seen.
      if (!currentSem.includes("summer")) firstNonSummerSemReached = true;
      continue;
    }

    const courseMatch = line.match(coursePattern);
    if (!courseMatch) continue;

    // Skip courses with an F or a W.
    if (line.endsWith("f") || line.endsWith("w")) continue;

    // TODO: We don't yet have a way to track courses that can be taken multiple
    // times, so a repeated course is stored only under its latest semester.
    // Anything before the first non-summer semester stays in _TRAN, including
    // repeats, so it never lands under an empty or pre-college summer key.
    courseToSem[courseMatch[0]] = firstNonSummerSemReached
      ? currentSem
      : "_TRAN";
  }

  return courseToSem;
};
182150
183- // Match majors
184- let detectedMajorsOptions = [ ] ;
185-
186- let possibleDegrees = getMajorOptions ( degrees , tempSchools , startYear ) ;
151+ // Given a list of detected majors, a list of detected concentrations, and a list of possible degrees,
152+ // return a list of detected majors options.
153+ const detectMajors = (
154+ detectedMajors : string [ ] ,
155+ detectedConcentrations : string [ ] ,
156+ possibleDegrees : DegreeOption [ ] | undefined
157+ ) => {
158+ const detectedMajorsOptions = [ ] ;
187159 for ( let i in detectedMajors ) {
188160 let m =
189161 detectedMajors [ i ] +
@@ -206,8 +178,85 @@ export const parseTranscript = (
206178 if ( majorOption ) detectedMajorsOptions . push ( majorOption ) ;
207179 }
208180 }
181+
182+ return detectedMajorsOptions ;
183+ } ;
184+
185+ // Given a list of lines from the PDF and a list of possible degrees,
186+ // return the scraped information (courses by semester, start year, schools, and detected majors).
187+ export const parseTranscript = (
188+ textResult : string [ ] ,
189+ degrees : DegreeListing [ ] | undefined
190+ ) => {
191+ let courseToSem : { [ key : string ] : string } = { } ;
192+ let startYear : number = 0 ;
193+ let tempSchools : { value : string ; label : string } [ ] = [ ] ;
194+ let detectedMajors : string [ ] = [ ] ;
195+ let detectedConcentrations : string [ ] = [ ] ;
196+
197+ for ( let l = 0 ; l < textResult . length ; l ++ ) {
198+ if ( textResult [ l ] . replaceAll ( " " , "" ) . includes ( "program:" ) ) {
199+ tempSchools = tempSchools . concat ( checkSchool ( textResult , l ) ) ;
200+ }
201+
202+ if ( textResult [ l ] . includes ( "major" ) ) {
203+ detectedMajors . push ( textResult [ l ] . replace ( / ^ .* ?: \s * / , "" ) ) ;
204+ }
205+
206+ if ( textResult [ l ] . includes ( "concentration" ) ) {
207+ detectedConcentrations . push ( textResult [ l ] . replace ( / ^ .* ?: \s * / , "" ) ) ;
208+ }
209+
210+ if ( textResult [ l ] . includes ( "transfer credit" ) ) {
211+ courseToSem = {
212+ ...courseToSem ,
213+ ...getAPAndTransferCourses ( textResult , l ) ,
214+ } ;
215+ }
216+
217+ if ( textResult [ l ] . includes ( "institution credit" ) ) {
218+ courseToSem = {
219+ ...courseToSem ,
220+ ...getCourseToSem ( textResult . slice ( l + 1 ) ) ,
221+ } ;
222+ }
223+ }
224+
225+ const separatedCourses = Object . entries ( courseToSem ) . reduce (
226+ ( acc , [ course , sem ] ) => {
227+ const trimmedSem = sem . trim ( ) ;
228+ if ( ! acc [ trimmedSem ] ) acc [ trimmedSem ] = [ ] ;
229+ acc [ trimmedSem ] . push ( course ) ;
230+ return acc ;
231+ } ,
232+ { } as { [ key : string ] : string [ ] }
233+ ) ;
234+
235+ const formattedSeparatedCourses = Object . entries ( separatedCourses ) . map (
236+ ( [ sem , courses ] ) => ( {
237+ sem,
238+ courses,
239+ } )
240+ ) ;
241+
242+ // Scrape start year and infer grad year
243+ let years = formattedSeparatedCourses . map (
244+ ( e : { sem : string ; courses : string [ ] } , i : number ) => {
245+ return parseInt ( e . sem . replace ( / \D / g, "" ) ) ;
246+ }
247+ ) ;
248+ years . shift ( ) ;
249+ startYear = Math . min ( ...years ) ;
250+
251+ let possibleDegrees = getMajorOptions ( degrees , tempSchools , startYear ) ;
252+ let detectedMajorsOptions = detectMajors (
253+ detectedMajors ,
254+ detectedConcentrations ,
255+ possibleDegrees
256+ ) ;
257+
209258 return {
210- scrapedCourses : separatedCourses ,
259+ scrapedCourses : formattedSeparatedCourses ,
211260 startYear : startYear ,
212261 scrapedSchools : tempSchools ,
213262 detectedMajorsOptions : detectedMajorsOptions ,
0 commit comments