@@ -36,23 +36,31 @@ version 0.7.0;
3636 fixed bug when attempting to crawl deeper than available URLs to crawl
3737 fixed crawl depth calculation
3838 optimized code which runs 2.8x faster vs v0.6.x during bench testing
39+ version 0.7.1;
40+ added progress bars to word / ngrams processing & file writing operations
41+ added RAM usage monitoring
42+ optimized order of operations for faster processing with less RAM
43+ TO-DO: refactor code (func main is getting messy)
3944*/
4045
// clearScreen clears the terminal by running the platform's own command:
// "clear" on Linux and macOS, "cmd /c cls" on Windows. The command's output
// is attached to os.Stdout.
//
// Clearing the screen is purely cosmetic, so this function never terminates
// the program. On an unrecognised platform, or when the command fails (for
// example because no usable terminal is available), a note is written to
// os.Stderr and the function simply returns.
func clearScreen() {
	var cmd *exec.Cmd

	switch runtime.GOOS {
	case "linux", "darwin":
		cmd = exec.Command("clear")
	case "windows":
		cmd = exec.Command("cmd", "/c", "cls")
	default:
		// No known clear command: skip instead of aborting the whole run.
		fmt.Fprintf(os.Stderr, "Unsupported platform %q: screen not cleared\n", runtime.GOOS)
		return
	}

	cmd.Stdout = os.Stdout
	if err := cmd.Run(); err != nil {
		// Report and carry on; a failed clear must not stop the crawl.
		fmt.Fprintf(os.Stderr, "Failed to clear screen: %v\n", err)
	}
}
5866
@@ -145,7 +153,7 @@ func crawlAndScrape(u string, depth int, delay int, urlCountChan chan<- int, tex
145153 absoluteLink := joinURL (u , link )
146154 linkDomain , err := getBaseDomain (absoluteLink )
147155 if err != nil {
148- fmt .Println ( "Error getting link domain:" , err )
156+ fmt .Fprintf ( os . Stderr , "Error getting link domain for %s: %v \n " , absoluteLink , err )
149157 continue
150158 }
151159 if linkDomain == baseDomain {
@@ -176,25 +184,38 @@ func joinURL(baseURL, relativeURL string) string {
176184 return newURL .String ()
177185}
178186
179- func generateNgrams (text string , n int ) []string {
180- words := strings .Fields (text )
181- if len (words ) < n {
182- return nil // return nil if not enough words for the n-gram
// updateProgressBar redraws a one-line, 20-cell progress bar on stdout.
//
// action is the label shown before the bar, total is the number of work
// items, and processed is how many of them are done. The line starts with a
// carriage return so successive calls overwrite one another; no trailing
// newline is written. Nothing is printed when total is not positive.
// processed is clamped to the range [0, total] so the bar never grows past
// its brackets and the percentage never leaves 0-100.
func updateProgressBar(action string, total, processed int) {
	const (
		barWidth       = 20
		percentPerCell = 100.0 / barWidth
	)

	if total <= 0 {
		return // nothing to measure against; also avoids division by zero
	}

	// Clamp so a miscounting caller cannot overflow or underflow the bar.
	if processed < 0 {
		processed = 0
	} else if processed > total {
		processed = total
	}

	percentage := float64(processed) / float64(total) * 100
	filled := int(percentage / percentPerCell)

	// Emit the whole line in one call rather than one write per cell.
	fmt.Printf("\r%s...\t[%s%s] %.2f%%",
		action,
		strings.Repeat("=", filled),
		strings.Repeat(" ", barWidth-filled),
		percentage)
}
190201
191- func uniqueStrings (str string ) map [string ]bool {
192- words := strings .Fields (str )
193- uniqueWords := make (map [string ]bool )
194- for _ , word := range words {
195- uniqueWords [word ] = true
// monitorRAMUsage records the peak value of runtime.MemStats.Alloc (bytes of
// allocated heap objects), expressed in GiB, into *maxRAMUsage.
//
// It is meant to run in its own goroutine. A sample is taken immediately and
// then every 100ms until a value is received from stopChan (or stopChan is
// closed), at which point the function returns. *maxRAMUsage is only ever
// raised, never lowered.
//
// All writes to *maxRAMUsage happen before the stop signal is received and
// none after it, so a caller that signals over an unbuffered channel may read
// the value once its send has completed.
func monitorRAMUsage(stopChan chan bool, maxRAMUsage *float64) {
	const (
		sampleInterval = 100 * time.Millisecond
		bytesPerGB     = 1 << 30
	)

	var memStats runtime.MemStats
	sample := func() {
		runtime.ReadMemStats(&memStats)
		if usage := float64(memStats.Alloc) / bytesPerGB; usage > *maxRAMUsage {
			*maxRAMUsage = usage
		}
	}

	// Take one reading up front so a run that is stopped before the first
	// tick still reports a non-zero peak.
	sample()

	ticker := time.NewTicker(sampleInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			sample()
		case <-stopChan:
			return
		}
	}
}
199220
200221// main function
@@ -218,15 +239,15 @@ func main() {
218239 }
219240
220241 if * versionFlag {
221- version := "Q3ljbG9uZSdzIFVSTCBTcGlkZXIgdjAuNy4wCg== "
242+ version := "Q3ljbG9uZSdzIFVSTCBTcGlkZXIgdjAuNy4xLWJldGEK "
222243 versionDecoded , _ := base64 .StdEncoding .DecodeString (version )
223244 fmt .Fprintln (os .Stderr , string (versionDecoded ))
224245 os .Exit (0 )
225246 }
226247
227248 if * urlFlag == "" {
228249 fmt .Fprintln (os .Stderr , "Error: -url flag is required" )
229- fmt .Fprintln (os .Stderr , "Try running -- help for more information" )
250+ fmt .Fprintln (os .Stderr , "Try running -help for more information" )
230251 os .Exit (1 )
231252 }
232253
@@ -287,7 +308,7 @@ func main() {
287308 fmt .Fprintln (os .Stderr , " ---------------------- " )
288309 fmt .Fprintln (os .Stderr )
289310 fmt .Fprintf (os .Stderr , "Crawling URL:\t %s\n " , * urlFlag )
290- fmt .Fprintf (os .Stderr , "Base Domain :\t %s\n " , baseDomain )
311+ fmt .Fprintf (os .Stderr , "Base domain :\t %s\n " , baseDomain )
291312 fmt .Fprintf (os .Stderr , "Crawl depth:\t %d\n " , * crawlFlag )
292313 fmt .Fprintf (os .Stderr , "ngram len:\t %s\n " , * ngramFlag )
293314 fmt .Fprintf (os .Stderr , "Crawl delay:\t %dms (increase this to avoid rate limiting, ex: -delay 100)\n " , * delayFlag )
@@ -298,6 +319,11 @@ func main() {
298319 visitedURLs := make (map [string ]bool )
299320 doneChan := make (chan struct {})
300321 var wg sync.WaitGroup
322+ stopMonitor := make (chan bool )
323+ var maxRAMUsage float64
324+
325+ // start RAM usage monitor
326+ go monitorRAMUsage (stopMonitor , & maxRAMUsage )
301327
302328 // goroutine to print URLs crawled
303329 wg .Add (1 )
@@ -308,49 +334,81 @@ func main() {
308334 for {
309335 select {
310336 case <- ticker .C :
311- fmt .Fprintf (os .Stderr , "\r URLs Crawled :\t %d" , totalCrawled )
337+ fmt .Fprintf (os .Stderr , "\r URLs crawled :\t %d" , totalCrawled )
312338 case count := <- urlCountChan :
313339 totalCrawled += count
314340 case <- doneChan :
315- fmt .Fprintf (os .Stderr , "\r URLs Crawled :\t %d" , totalCrawled ) // final update
341+ fmt .Fprintf (os .Stderr , "\r URLs crawled :\t %d" , totalCrawled ) // final update
316342 return
317343 }
318344 }
319345 }()
320346
321347 // start crawling process in goroutine
348+ wg .Add (1 )
322349 go func () {
350+ defer wg .Done ()
323351 crawlAndScrape (* urlFlag , * crawlFlag , * delayFlag , urlCountChan , textsChan , visitedURLs )
324- close (textsChan ) // close channel after crawling is complete
325- }()
326-
327- // wait for crawling to complete
328- go func () {
329- wg .Wait ()
352+ time .Sleep (100 * time .Millisecond )
353+ close (textsChan )
330354 close (doneChan )
355+ fmt .Println ()
331356 }()
332357
333- // process the collected texts and generate n-grams
334- ngrams := make (map [string ]bool )
358+ // initialize maps for unique word and n-gram counting
359+ uniqueWordsMap := make (map [string ]bool )
360+ uniqueNgramsMap := make (map [string ]bool )
335361
336- if len (ngramRange ) > 1 {
337- ngramMax , _ = strconv .Atoi (ngramRange [1 ])
362+ // collect all texts into a slice
363+ var texts []string
364+ for text := range textsChan {
365+ texts = append (texts , text )
338366 }
367+ totalTexts := len (texts )
339368
340- for text := range textsChan {
341- for i := ngramMin ; i <= ngramMax ; i ++ {
342- for _ , ngram := range generateNgrams (text , i ) {
343- ngrams [ngram ] = true
369+ // set up progress bar ticker
370+ progressTicker := time .NewTicker (100 * time .Millisecond ) // update progress every 100ms
371+ defer progressTicker .Stop ()
372+ processedTexts := 0
373+
374+ // process texts and generate n-grams
375+ for _ , text := range texts {
376+ words := strings .Fields (text )
377+ for _ , word := range words {
378+ uniqueWordsMap [word ] = true // count unique words
379+ }
380+
381+ for i := 0 ; i <= len (words )- ngramMin ; i ++ {
382+ for n := ngramMin ; n <= ngramMax && i + n <= len (words ); n ++ {
383+ ngram := strings .Join (words [i :i + n ], " " )
384+ uniqueNgramsMap [ngram ] = true // count unique n-grams
344385 }
345386 }
387+
388+ processedTexts ++
389+ select {
390+ case <- progressTicker .C :
391+ updateProgressBar ("Processing" , totalTexts , processedTexts )
392+ default :
393+ // continue without blocking if ticker channel is not ready
394+ }
346395 }
347396
348- // extract n-grams into a slice
397+ // final update to progress bar output
398+ updateProgressBar ("Processing" , totalTexts , processedTexts )
399+
400+ // convert unique n-grams map back to a slice for writing to file
349401 var ngramSlice []string
350- for ngram := range ngrams {
402+ for ngram := range uniqueNgramsMap {
351403 ngramSlice = append (ngramSlice , ngram )
352404 }
353405
406+ // calculated counts
407+ uniqueWords := len (uniqueWordsMap )
408+ uniqueNgrams := len (uniqueNgramsMap )
409+ fmt .Fprintf (os .Stderr , "\n Unique words:\t %d\n " , uniqueWords )
410+ fmt .Fprintf (os .Stderr , "Unique ngrams:\t %d\n " , uniqueNgrams )
411+
354412 // write unique n-grams to file
355413 file , err := os .Create (* oFlag )
356414 if err != nil {
@@ -360,29 +418,43 @@ func main() {
360418 defer file .Close ()
361419
362420 writer := bufio .NewWriterSize (file , 1 * 1024 * 1024 ) // 1MB buffer for better write performance
363- for _ , ngram := range ngramSlice {
421+ totalNgrams := len (ngramSlice )
422+
423+ // progress update interval
424+ progressUpdateInterval := totalNgrams / 100
425+ if progressUpdateInterval == 0 {
426+ progressUpdateInterval = 1
427+ }
428+
429+ var memStats runtime.MemStats
430+ runtime .ReadMemStats (& memStats )
431+
432+ for i , ngram := range ngramSlice {
364433 _ , err := writer .WriteString (ngram + "\n " )
365434 if err != nil {
366435 fmt .Println ("Error writing to buffer:" , err )
367436 return
368437 }
438+ if i % progressUpdateInterval == 0 {
439+ updateProgressBar ("Writing" , totalNgrams , i + 1 ) // update write progress bar
440+ }
369441 }
442+
370443 err = writer .Flush ()
371444 if err != nil {
372445 fmt .Println ("Error flushing buffer to file:" , err )
373446 return
374447 }
448+ updateProgressBar ("Writing" , totalNgrams , totalNgrams ) // final update to write progress bar
375449
376- // calculate unique words and n-grams
377- uniqueWords := len (uniqueStrings (strings .Join (ngramSlice , " " )))
378- uniqueNgrams := len (ngramSlice )
450+ // stop RAM monitoring
451+ stopMonitor <- true
379452
380453 // print statistics
381- runtime := time .Since (start )
382- fmt .Fprintf (os .Stderr , "\n Unique words:\t %d\n " , uniqueWords )
383- fmt .Fprintf (os .Stderr , "Unique ngrams:\t %d\n " , uniqueNgrams )
384- fmt .Fprintf (os .Stderr , "Saved to:\t %s\n " , * oFlag )
385- fmt .Fprintf (os .Stderr , "Runtime:\t %.3fs\n " , runtime .Seconds ())
454+ fmt .Fprintf (os .Stderr , "\n Output file:\t %s\n " , * oFlag )
455+ fmt .Fprintf (os .Stderr , "RAM used:\t %.2f GB\n " , maxRAMUsage )
456+ runTime := time .Since (start )
457+ fmt .Fprintf (os .Stderr , "Runtime:\t %.3fs\n " , runTime .Seconds ())
386458}
387459
388- // end code
460+ // end code
0 commit comments