88 "net/http"
99 "net/url"
1010 "os"
11- "os/exec "
11+ "path "
1212 "path/filepath"
1313 "runtime"
1414 "sort"
@@ -23,12 +23,13 @@ import (
 /*
 cyclone's url spider
 spider will crawl a url and create a wordlist, or use flag -ngram to create ngrams
-version 0.5.10; initial github release
-version 0.6.2;
+v0.5.10;
+	initial github release
+v0.6.2;
 	fixed scraping logic & ngram creation bugs
 	switched from gocolly to goquery for web scraping
 	removed dups from word / ngrams output
-version 0.7.0;
+v0.7.0;
 	added feature to allow crawling specific file extensions (html, htm, txt)
 	added check to keep crawler from crawling offsite URLs
 	added flag "-delay" to avoid rate limiting (-delay 100 == 100ms delay between URL requests)
@@ -38,20 +39,32 @@ version 0.7.0;
 	fixed bug when attempting to crawl deeper than available URLs to crawl
 	fixed crawl depth calculation
 	optimized code which runs 2.8x faster vs v0.6.x during bench testing
-version 0.7.1;
+v0.7.1;
 	added progress bars to word / ngrams processing & file writing operations
 	added RAM usage monitoring
 	optimized order of operations for faster processing with less RAM
 	TO-DO: refactor code (func main is getting messy)
-	TO-DO: add -file flag to allow crawling local plaintext files such as an ebook.txt
+	TO-DO: add -file flag to allow crawling local plaintext files such as an ebook.txt (COMPLETED in v0.8.0)
 v0.8.0;
 	added flag "-file" to allow creating ngrams from a local plaintext file (ex: foobar.txt)
 	added flag "-timeout" for -url mode
 	added flag "-sort" which sorts output by frequency
 	fixed several small bugs
+v0.8.1;
+	updated default -delay to 10ms
+v0.9.0;
+	added flag "-url-match" to only crawl URLs containing a specified keyword; https://github.com/cyclone-github/spider/issues/6
+	added notice to user if no URLs are crawled when using "-crawl 1 -url-match"
+	exit early if zero URLs were crawled (no processing or file output)
+	use custom User-Agent "Spider/0.9.0 (+https://github.com/cyclone-github/spider)"
+	removed clearScreen function and its imports
+	fixed crawl-depth calculation logic
+	restricted link collection to .html, .htm, .txt and extension-less paths
+	upgraded dependencies and bumped Go version to v1.24.3
 */
 
 // clear screen function
+/*
 func clearScreen() {
 	var cmd *exec.Cmd
 
@@ -71,6 +84,7 @@ func clearScreen() {
 		os.Exit(1)
 	}
 }
+*/
 
 // goquery
 func getDocumentFromURL(targetURL string, timeout time.Duration) (*goquery.Document, bool, error) {
@@ -79,7 +93,7 @@ func getDocumentFromURL(targetURL string, timeout time.Duration) (*goquery.Docum
 	if err != nil {
 		return nil, false, err
 	}
-	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
+	req.Header.Set("User-Agent", "Spider/0.9.0 (+https://github.com/cyclone-github/spider)")
 
 	res, err := client.Do(req)
 	if err != nil {
@@ -106,16 +120,23 @@ func hasAnySuffix(s string, suffixes []string) bool {
 
 func getLinksFromDocument(doc *goquery.Document, baseURL string) []string {
 	var links []string
-	validSuffixes := []string{".html", ".htm", ".txt"} // specifically crawl file types, ex: if listed in a file server
-
-	doc.Find("a[href]").Each(func(index int, item *goquery.Selection) {
-		link, exists := item.Attr("href")
-		if exists {
-			absoluteLink := joinURL(baseURL, link) // convert to absolute URL
-			// crawl any non-anchor or valid-file-type link
-			if hasAnySuffix(link, validSuffixes) || !strings.HasPrefix(link, "#") {
-				links = append(links, absoluteLink)
-			}
+	validSuffixes := map[string]bool{
+		".html": true,
+		".htm":  true,
+		".txt":  true,
+	}
+
+	doc.Find("a[href]").Each(func(_ int, item *goquery.Selection) {
+		href, exists := item.Attr("href")
+		if !exists || strings.HasPrefix(href, "#") {
+			return
+		}
+		absoluteLink := joinURL(baseURL, href)
+
+		// only allow approved extensions or none at all
+		ext := strings.ToLower(path.Ext(absoluteLink))
+		if ext == "" || validSuffixes[ext] {
+			links = append(links, absoluteLink)
 		}
 	})
 	return links
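
(For reference, a minimal standalone sketch of the same extension filter; it is not part of the commit and the sample URLs are made up. Links are kept when they end in .html/.htm/.txt or have no extension at all; note that path.Ext here operates on the raw URL string.)

package main

import (
	"fmt"
	"path"
	"strings"
)

func main() {
	// same allow-list as getLinksFromDocument above
	validSuffixes := map[string]bool{".html": true, ".htm": true, ".txt": true}
	samples := []string{
		"https://example.com/docs/index.html",  // kept: .html
		"https://example.com/wiki/Main_Page",   // kept: no extension
		"https://example.com/files/report.pdf", // skipped: .pdf not in the allow-list
	}
	for _, link := range samples {
		ext := strings.ToLower(path.Ext(link))
		fmt.Printf("%-40s keep=%v\n", link, ext == "" || validSuffixes[ext])
	}
}
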
@@ -128,7 +149,7 @@ func getTextFromDocument(doc *goquery.Document) string {
 	return doc.Text()
 }
 
-func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, urlCountChan chan<- int, textsChan chan<- string, visited map[string]bool) {
+func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, urlCountChan chan<- int, textsChan chan<- string, visited map[string]bool, urlMatchStr string) {
 	if visited[u] {
 		return
 	}
@@ -142,28 +163,37 @@ func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, urlCo
 	if !isSuccess {
 		return
 	}
-	urlCountChan <- 1 // URL processed
 
-	text := getTextFromDocument(doc)
-	textsChan <- text // send the text for later n-gram processing
+	// only count & scrape text if it contains -url-match
+	if urlMatchStr == "" || strings.Contains(strings.ToLower(u), urlMatchStr) {
+		urlCountChan <- 1                     // URL processed
+		textsChan <- getTextFromDocument(doc) // send the text for later n-gram processing
+	}
 
 	if depth > 1 {
 		baseDomain, err := getBaseDomain(u)
 		if err != nil {
 			fmt.Fprintf(os.Stderr, "Error getting base domain: %v\n", err)
 			return
 		}
-		links := getLinksFromDocument(doc, u)
-		for _, link := range links {
+		for _, link := range getLinksFromDocument(doc, u) {
 			time.Sleep(time.Duration(delay) * time.Millisecond)
+
 			linkDomain, err := getBaseDomain(link)
 			if err != nil {
 				fmt.Fprintf(os.Stderr, "Error parsing link %s: %v\n", link, err)
 				continue
 			}
-			if linkDomain == baseDomain {
-				crawlAndScrape(link, depth-1, delay, timeout, urlCountChan, textsChan, visited)
+			if linkDomain != baseDomain {
+				continue
+			}
+
+			// only *descend* into children that match (if urlMatchStr was provided)
+			if urlMatchStr != "" && !strings.Contains(strings.ToLower(link), urlMatchStr) {
+				continue
 			}
+
+			crawlAndScrape(link, depth-1, delay, timeout, urlCountChan, textsChan, visited, urlMatchStr)
 		}
 	}
 }
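
(A quick illustration of the -url-match check used above, as a standalone sketch rather than the project's code; it assumes the match is a plain case-insensitive substring test, and the sample URLs and keyword are made up.)

package main

import (
	"fmt"
	"strings"
)

// matches mirrors the filter: an empty keyword matches everything,
// otherwise the lowercased URL must contain the lowercased keyword.
func matches(u, urlMatchStr string) bool {
	return urlMatchStr == "" || strings.Contains(strings.ToLower(u), urlMatchStr)
}

func main() {
	keyword := strings.ToLower("Blog")
	fmt.Println(matches("https://example.com/Blog/post-1", keyword)) // true
	fmt.Println(matches("https://example.com/about", keyword))       // false
	fmt.Println(matches("https://example.com/about", ""))            // true: no -url-match given
}
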
@@ -225,7 +255,7 @@ func monitorRAMUsage(stopChan chan bool, maxRAMUsage *float64) {
 
 // main function
 func main() {
-	clearScreen()
+	// clearScreen()
 
 	cycloneFlag := flag.Bool("cyclone", false, "Display coded message")
 	versionFlag := flag.Bool("version", false, "Display version")
@@ -234,9 +264,10 @@ func main() {
 	ngramFlag := flag.String("ngram", "1", "Lengths of n-grams (e.g., \"1-3\" for 1, 2, and 3-length n-grams).")
 	oFlag := flag.String("o", "", "Output file for the n-grams")
 	crawlFlag := flag.Int("crawl", 1, "Depth of links to crawl")
-	delayFlag := flag.Int("delay", 0, "Delay in ms between each URL lookup to avoid rate limiting")
+	delayFlag := flag.Int("delay", 10, "Delay in ms between each URL lookup to avoid rate limiting")
 	timeoutFlag := flag.Int("timeout", 1, "Timeout for URL crawling in seconds")
 	sortFlag := flag.Bool("sort", false, "Sort output by frequency")
+	urlMatchFlag := flag.String("url-match", "", "Only crawl URLs containing this keyword (case-insensitive)")
 	flag.Parse()
 
 	if *cycloneFlag {
@@ -246,7 +277,7 @@ func main() {
 		os.Exit(0)
 	}
 	if *versionFlag {
-		version := "Cyclone's URL Spider v0.8.0"
+		version := "Cyclone's URL Spider v0.9.0"
 		fmt.Fprintln(os.Stderr, version)
 		os.Exit(0)
 	}
@@ -259,6 +290,8 @@ func main() {
 	}
 	fileMode := *fileFlag != ""
 
+	urlMatchStr := strings.ToLower(*urlMatchFlag)
+
 	var baseDomain string
 	if !fileMode {
 		// URL mode
@@ -333,7 +366,7 @@ func main() {
 		fmt.Fprintf(os.Stderr, "Base domain:\t%s\n", baseDomain)
 		fmt.Fprintf(os.Stderr, "Crawl depth:\t%d\n", *crawlFlag)
 		fmt.Fprintf(os.Stderr, "ngram len:\t%s\n", *ngramFlag)
-		fmt.Fprintf(os.Stderr, "Crawl delay:\t%dms (increase this to avoid rate limiting)\n", *delayFlag)
+		fmt.Fprintf(os.Stderr, "Crawl delay:\t%dms (increase to avoid rate limiting)\n", *delayFlag)
 		fmt.Fprintf(os.Stderr, "Timeout:\t%d sec\n", *timeoutFlag)
 	}
 
@@ -370,15 +403,17 @@ func main() {
 		defer wg.Done()
 		ticker := time.NewTicker(50 * time.Millisecond)
 		defer ticker.Stop()
-		totalCrawled := 1
+		totalCrawled := 0
 		for {
 			select {
 			case <-ticker.C:
 				fmt.Fprintf(os.Stderr, "\rURLs crawled:\t%d", totalCrawled)
 			case count := <-urlCountChan:
 				totalCrawled += count
 			case <-doneChan:
-				fmt.Fprintf(os.Stderr, "\rURLs crawled:\t%d", totalCrawled)
+				if totalCrawled > 0 {
+					fmt.Fprintf(os.Stderr, "\rURLs crawled:\t%d", totalCrawled)
+				}
 				return
 			}
 		}
@@ -388,7 +423,7 @@ func main() {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		crawlAndScrape(*urlFlag, *crawlFlag, *delayFlag, timeoutDur, urlCountChan, textsChan, visitedURLs)
+		crawlAndScrape(*urlFlag, *crawlFlag, *delayFlag, timeoutDur, urlCountChan, textsChan, visitedURLs, urlMatchStr)
 		time.Sleep(100 * time.Millisecond)
 		close(textsChan)
 		close(doneChan)
@@ -401,6 +436,17 @@ func main() {
 	for text := range textsChan {
 		texts = append(texts, text)
 	}
+
+	// if nothing matched, exit early
+	if len(texts) == 0 {
+		time.Sleep(100 * time.Millisecond)
+		fmt.Fprintln(os.Stderr, "No URLs crawled, exiting...") // boo, something went wrong!
+		if *crawlFlag == 1 {
+			fmt.Fprintln(os.Stderr, "Try increasing -crawl depth, or remove -url-match")
+		}
+		return
+	}
+
 	totalTexts := len(texts)
 
 	// set up progress bar ticker