88 "net/http"
99 "net/url"
1010 "os"
11- "os/exec "
11+ "path "
1212 "path/filepath"
1313 "runtime"
1414 "sort"
@@ -23,12 +23,13 @@ import (
 /*
 cyclone's url spider
 spider will crawl a url and create a wordlist, or use flag -ngram to create ngrams
-version 0.5.10; initial github release
-version 0.6.2;
+v0.5.10;
+	initial github release
+v0.6.2;
 	fixed scraping logic & ngram creation bugs
 	switched from gocolly to goquery for web scraping
 	removed dups from word / ngrams output
-version 0.7.0;
+v0.7.0;
 	added feature to allow crawling specific file extensions (html, htm, txt)
 	added check to keep crawler from crawling offsite URLs
 	added flag "-delay" to avoid rate limiting (-delay 100 == 100ms delay between URL requests)
@@ -38,20 +39,32 @@ version 0.7.0;
 	fixed bug when attempting to crawl deeper than available URLs to crawl
 	fixed crawl depth calculation
 	optimized code which runs 2.8x faster vs v0.6.x during bench testing
-version 0.7.1;
+v0.7.1;
 	added progress bars to word / ngrams processing & file writing operations
 	added RAM usage monitoring
 	optimized order of operations for faster processing with less RAM
 	TO-DO: refactor code (func main is getting messy)
-	TO-DO: add -file flag to allow crawling local plaintext files such as an ebook.txt
+	TO-DO: add -file flag to allow crawling local plaintext files such as an ebook.txt (COMPLETED in v0.8.0)
 v0.8.0;
 	added flag "-file" to allow creating ngrams from a local plaintext file (ex: foobar.txt)
 	added flag "-timeout" for -url mode
 	added flag "-sort" which sorts output by frequency
 	fixed several small bugs
+v0.8.1;
+	updated default -delay to 10ms
+v0.9.0;
+	added flag "-url-match" to only crawl URLs containing a specified keyword; https://github.com/cyclone-github/spider/issues/6
+	added notice to user if no URLs are crawled when using "-crawl 1 -url-match"
+	exit early if zero URLs were crawled (no processing or file output)
+	use custom User-Agent "Spider/0.9.0 (+https://github.com/cyclone-github/spider)"
+	removed clearScreen function and its imports
+	fixed crawl-depth calculation logic
+	restricted link collection to .html, .htm, .txt and extension-less paths
+	upgraded dependencies and bumped Go version to v1.24.3
 */
 
 // clear screen function
+/*
 func clearScreen() {
 	var cmd *exec.Cmd
 
@@ -71,6 +84,7 @@ func clearScreen() {
 		os.Exit(1)
 	}
 }
+*/
 
 // goquery
 func getDocumentFromURL(targetURL string, timeout time.Duration) (*goquery.Document, bool, error) {
@@ -79,7 +93,7 @@ func getDocumentFromURL(targetURL string, timeout time.Duration) (*goquery.Docum
 	if err != nil {
 		return nil, false, err
 	}
-	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
+	req.Header.Set("User-Agent", "Spider/0.9.0 (+https://github.com/cyclone-github/spider)")
 
 	res, err := client.Do(req)
 	if err != nil {
@@ -106,16 +120,23 @@ func hasAnySuffix(s string, suffixes []string) bool {
 
 func getLinksFromDocument(doc *goquery.Document, baseURL string) []string {
 	var links []string
-	validSuffixes := []string{".html", ".htm", ".txt"} // specifically crawl file types, ex: if listed in a file server
-
-	doc.Find("a[href]").Each(func(index int, item *goquery.Selection) {
-		link, exists := item.Attr("href")
-		if exists {
-			absoluteLink := joinURL(baseURL, link) // convert to absolute URL
-			// crawl any non-anchor or valid-file-type link
-			if hasAnySuffix(link, validSuffixes) || !strings.HasPrefix(link, "#") {
-				links = append(links, absoluteLink)
-			}
+	validSuffixes := map[string]bool{
+		".html": true,
+		".htm":  true,
+		".txt":  true,
+	}
+
+	doc.Find("a[href]").Each(func(_ int, item *goquery.Selection) {
+		href, exists := item.Attr("href")
+		if !exists || strings.HasPrefix(href, "#") {
+			return
+		}
+		absoluteLink := joinURL(baseURL, href)
+
+		// only allow approved extensions or none at all
+		ext := strings.ToLower(path.Ext(absoluteLink))
+		if ext == "" || validSuffixes[ext] {
+			links = append(links, absoluteLink)
 		}
 	})
 	return links
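
(For reference, a minimal standalone sketch of the same extension filter; it is not part of the commit and the sample URLs are made up. Links are kept when they end in .html/.htm/.txt or have no extension at all; note that path.Ext here operates on the raw URL string.)

package main

import (
	"fmt"
	"path"
	"strings"
)

func main() {
	// same allow-list as getLinksFromDocument above
	validSuffixes := map[string]bool{".html": true, ".htm": true, ".txt": true}
	samples := []string{
		"https://example.com/docs/index.html",  // kept: .html
		"https://example.com/wiki/Main_Page",   // kept: no extension
		"https://example.com/files/report.pdf", // skipped: .pdf not in the allow-list
	}
	for _, link := range samples {
		ext := strings.ToLower(path.Ext(link))
		fmt.Printf("%-40s keep=%v\n", link, ext == "" || validSuffixes[ext])
	}
}
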
@@ -128,7 +149,7 @@ func getTextFromDocument(doc *goquery.Document) string {
 	return doc.Text()
 }
 
-func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, urlCountChan chan<- int, textsChan chan<- string, visited map[string]bool) {
+func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, urlCountChan chan<- int, textsChan chan<- string, visited map[string]bool, urlMatchStr string) {
 	if visited[u] {
 		return
 	}
@@ -142,28 +163,37 @@ func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, urlCo
 	if !isSuccess {
 		return
 	}
-	urlCountChan <- 1 // URL processed
 
-	text := getTextFromDocument(doc)
-	textsChan <- text // send the text for later n-gram processing
+	// only count & scrape text if it contains -url-match
+	if urlMatchStr == "" || strings.Contains(strings.ToLower(u), urlMatchStr) {
+		urlCountChan <- 1                     // URL processed
+		textsChan <- getTextFromDocument(doc) // send the text for later n-gram processing
+	}
 
 	if depth > 1 {
 		baseDomain, err := getBaseDomain(u)
 		if err != nil {
 			fmt.Fprintf(os.Stderr, "Error getting base domain: %v\n", err)
 			return
 		}
-		links := getLinksFromDocument(doc, u)
-		for _, link := range links {
+		for _, link := range getLinksFromDocument(doc, u) {
 			time.Sleep(time.Duration(delay) * time.Millisecond)
+
 			linkDomain, err := getBaseDomain(link)
 			if err != nil {
 				fmt.Fprintf(os.Stderr, "Error parsing link %s: %v\n", link, err)
 				continue
 			}
-			if linkDomain == baseDomain {
-				crawlAndScrape(link, depth-1, delay, timeout, urlCountChan, textsChan, visited)
+			if linkDomain != baseDomain {
+				continue
+			}
+
+			// only *descend* into children that match (if urlMatchStr was provided)
+			if urlMatchStr != "" && !strings.Contains(strings.ToLower(link), urlMatchStr) {
+				continue
 			}
+
+			crawlAndScrape(link, depth-1, delay, timeout, urlCountChan, textsChan, visited, urlMatchStr)
 		}
 	}
 }
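
(A quick illustration of the -url-match check used above, as a standalone sketch rather than the project's code; it assumes the match is a plain case-insensitive substring test, and the sample URLs and keyword are made up.)

package main

import (
	"fmt"
	"strings"
)

// matches mirrors the filter: an empty keyword matches everything,
// otherwise the lowercased URL must contain the lowercased keyword.
func matches(u, urlMatchStr string) bool {
	return urlMatchStr == "" || strings.Contains(strings.ToLower(u), urlMatchStr)
}

func main() {
	keyword := strings.ToLower("Blog")
	fmt.Println(matches("https://example.com/Blog/post-1", keyword)) // true
	fmt.Println(matches("https://example.com/about", keyword))       // false
	fmt.Println(matches("https://example.com/about", ""))            // true: no -url-match given
}
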
@@ -225,7 +255,7 @@ func monitorRAMUsage(stopChan chan bool, maxRAMUsage *float64) {
 
 // main function
 func main() {
-	clearScreen()
+	// clearScreen()
 
 	cycloneFlag := flag.Bool("cyclone", false, "Display coded message")
 	versionFlag := flag.Bool("version", false, "Display version")
@@ -234,9 +264,10 @@ func main() {
 	ngramFlag := flag.String("ngram", "1", "Lengths of n-grams (e.g., \"1-3\" for 1, 2, and 3-length n-grams).")
 	oFlag := flag.String("o", "", "Output file for the n-grams")
 	crawlFlag := flag.Int("crawl", 1, "Depth of links to crawl")
-	delayFlag := flag.Int("delay", 0, "Delay in ms between each URL lookup to avoid rate limiting")
+	delayFlag := flag.Int("delay", 10, "Delay in ms between each URL lookup to avoid rate limiting")
 	timeoutFlag := flag.Int("timeout", 1, "Timeout for URL crawling in seconds")
 	sortFlag := flag.Bool("sort", false, "Sort output by frequency")
+	urlMatchFlag := flag.String("url-match", "", "Only crawl URLs containing this keyword (case-insensitive)")
 	flag.Parse()
 
 	if *cycloneFlag {
@@ -246,7 +277,7 @@ func main() {
 		os.Exit(0)
 	}
 	if *versionFlag {
-		version := "Cyclone's URL Spider v0.8.0"
+		version := "Cyclone's URL Spider v0.9.0"
 		fmt.Fprintln(os.Stderr, version)
 		os.Exit(0)
 	}
@@ -259,6 +290,8 @@ func main() {
 	}
 	fileMode := *fileFlag != ""
 
+	urlMatchStr := strings.ToLower(*urlMatchFlag)
+
 	var baseDomain string
 	if !fileMode {
 		// URL mode
@@ -333,7 +366,7 @@ func main() {
 		fmt.Fprintf(os.Stderr, "Base domain:\t%s\n", baseDomain)
 		fmt.Fprintf(os.Stderr, "Crawl depth:\t%d\n", *crawlFlag)
 		fmt.Fprintf(os.Stderr, "ngram len:\t%s\n", *ngramFlag)
-		fmt.Fprintf(os.Stderr, "Crawl delay:\t%dms (increase this to avoid rate limiting)\n", *delayFlag)
+		fmt.Fprintf(os.Stderr, "Crawl delay:\t%dms (increase to avoid rate limiting)\n", *delayFlag)
 		fmt.Fprintf(os.Stderr, "Timeout:\t%d sec\n", *timeoutFlag)
 	}
 
@@ -370,15 +403,17 @@ func main() {
 		defer wg.Done()
 		ticker := time.NewTicker(50 * time.Millisecond)
 		defer ticker.Stop()
-		totalCrawled := 1
+		totalCrawled := 0
 		for {
 			select {
 			case <-ticker.C:
 				fmt.Fprintf(os.Stderr, "\rURLs crawled:\t%d", totalCrawled)
 			case count := <-urlCountChan:
 				totalCrawled += count
 			case <-doneChan:
-				fmt.Fprintf(os.Stderr, "\rURLs crawled:\t%d", totalCrawled)
+				if totalCrawled > 0 {
+					fmt.Fprintf(os.Stderr, "\rURLs crawled:\t%d", totalCrawled)
+				}
 				return
 			}
 		}
@@ -388,7 +423,7 @@ func main() {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		crawlAndScrape(*urlFlag, *crawlFlag, *delayFlag, timeoutDur, urlCountChan, textsChan, visitedURLs)
+		crawlAndScrape(*urlFlag, *crawlFlag, *delayFlag, timeoutDur, urlCountChan, textsChan, visitedURLs, urlMatchStr)
 		time.Sleep(100 * time.Millisecond)
 		close(textsChan)
 		close(doneChan)
@@ -401,6 +436,17 @@ func main() {
 	for text := range textsChan {
 		texts = append(texts, text)
 	}
+
+	// if nothing matched, exit early
+	if len(texts) == 0 {
+		time.Sleep(100 * time.Millisecond)
+		fmt.Fprintln(os.Stderr, "No URLs crawled, exiting...") // boo, something went wrong!
+		if *crawlFlag == 1 {
+			fmt.Fprintln(os.Stderr, "Try increasing -crawl depth, or remove -url-match")
+		}
+		return
+	}
+
 	totalTexts := len(texts)
 
 	// set up progress bar ticker