@@ -4,54 +4,25 @@ import (
 	"encoding/base64"
 	"flag"
 	"fmt"
-	"github.com/gocolly/colly/v2"
+	"github.com/PuerkitoBio/goquery"
+	"net/http"
 	"net/url"
 	"os"
 	"os/exec"
-	"regexp"
 	"runtime"
-	"sort"
+	"strconv"
 	"strings"
-	"sync"
 	"time"
 )
 
 // cyclone's url spider
+// spider will crawl a url and create a wordlist, or use flag -ngram to create ngrams
 // version 0.5.10; initial github release
-
-// global variables... I know...
-var (
-	urlFlag     string
-	crawlFlag   int
-	oFlag       string
-	phraseFlag  int
-	cycloneFlag bool
-	versionFlag bool
-	wordList    = make(map[string]int)
-	wordListMu  sync.Mutex
-)
-
-// initialize flags
-func init() {
-	flag.StringVar(&urlFlag, "url", "", "URL to scrape")
-	flag.IntVar(&crawlFlag, "crawl", 1, "Depth to crawl links")
-	flag.StringVar(&oFlag, "o", "", "Output file for word list")
-	flag.IntVar(&phraseFlag, "phrase", 1, "Process pairs of words")
-	flag.BoolVar(&cycloneFlag, "cyclone", false, "")
-	flag.BoolVar(&versionFlag, "version", false, "Version number")
-	flag.Parse()
-
-	// check for "http*" on urlFlag so gocolly doesn't wet the bed
-	u, err := url.Parse(urlFlag)
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "Error parsing URL: %v\n", err)
-		os.Exit(1)
-	}
-	if u.Scheme == "" {
-		u.Scheme = "https"
-		urlFlag = u.String()
-	}
-}
+/* version 0.6.2;
+	fixed scraping logic & ngram creation bugs
+	switched from gocolly to goquery for web scraping
+	remove dups from word / ngram output
+*/
 
 // clear screen function
 func clearScreen() {
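Per the version comments above, v0.6.2 drops the old frequency-counting map[string]int in favor of plain string sets to "remove dups from word / ngram output". A minimal sketch of the map-as-set idiom the new code relies on (the sample values are illustrative only):

	package main

	import "fmt"

	func main() {
		seen := make(map[string]bool)
		for _, w := range []string{"cyclone", "spider", "cyclone"} {
			seen[w] = true // duplicate keys simply collapse
		}
		fmt.Println(len(seen)) // 2 unique entries
	}

len(seen) is the unique count, which is how the statistics printed at the end of main are computed.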
@@ -71,114 +42,177 @@ func clearScreen() {
 	}
 }
 
-// word processing logic
-func processWords(text string, phrase int) {
-	// acquire lock before accessing wordList
-	wordListMu.Lock()
-	defer wordListMu.Unlock()
+// goquery
+func getDocumentFromURL(targetURL string) (*goquery.Document, error) {
+	client := &http.Client{}
+	req, err := http.NewRequest("GET", targetURL, nil)
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
+	res, err := client.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer res.Body.Close()
+	return goquery.NewDocumentFromReader(res.Body)
+}
+
+func getLinksFromDocument(doc *goquery.Document) []string {
+	var links []string
+	doc.Find("a[href]").Each(func(index int, item *goquery.Selection) {
+		linkTag := item
+		link, _ := linkTag.Attr("href")
+		links = append(links, link)
+	})
+	return links
+}
 
-	wordRegex := regexp.MustCompile(`\w+`)
-	words := wordRegex.FindAllString(text, -1)
+func getTextFromDocument(doc *goquery.Document) string {
+	doc.Find("script, style").Each(func(index int, item *goquery.Selection) {
+		item.Remove()
+	})
+	return doc.Text()
+}
 
-	for i := 0; i < len(words); i++ {
-		if i+phrase <= len(words) {
-			phraseWords := make([]string, phrase)
-			for j := 0; j < phrase; j++ {
-				phraseWords[j] = words[i+j]
-			}
-			phraseStr := strings.Join(phraseWords, " ")
-			if _, ok := wordList[phraseStr]; ok {
-				wordList[phraseStr]++
-			} else {
-				wordList[phraseStr] = 1
-			}
-		} else {
-			word := words[i]
-			if _, ok := wordList[word]; ok {
-				wordList[word]++
-			} else {
-				wordList[word] = 1
+func crawlAndScrape(u string, depth int, phrase int) map[string]bool {
+	ngrams := make(map[string]bool)
+	doc, err := getDocumentFromURL(u)
+	if err != nil {
+		fmt.Println("Error fetching URL:", err)
+		return ngrams
+	}
+	text := getTextFromDocument(doc)
+	for _, ngram := range generateNgrams(text, phrase) {
+		ngrams[ngram] = true
+	}
+
+	if depth > 1 {
+		links := getLinksFromDocument(doc)
+		// cap the slice bound so a page with fewer links than depth-1 doesn't panic
+		if len(links) > depth-1 {
+			links = links[:depth-1]
+		}
+		for _, link := range links {
+			absoluteLink := joinURL(u, link)
+			childNgrams := crawlAndScrape(absoluteLink, depth-1, phrase)
+			for ngram := range childNgrams {
+				ngrams[ngram] = true
 			}
 		}
 	}
+
+	return ngrams
 }
 
-// save wordlist logic
-func saveWordList(filename string) {
-	// acquire lock before accessing wordList
-	wordListMu.Lock()
-	defer wordListMu.Unlock()
-	file, err := os.Create(filename)
+func joinURL(baseURL, relativeURL string) string {
+	u, err := url.Parse(baseURL)
 	if err != nil {
-		fmt.Fprintf(os.Stderr, "Error creating output file: %v\n", err)
-		os.Exit(1)
+		return ""
 	}
-	defer file.Close()
-
-	type wordCount struct {
-		Word  string
-		Count int
+	newURL, err := u.Parse(relativeURL)
+	if err != nil {
+		return ""
 	}
+	return newURL.String()
+}
 
-	var counts []wordCount
-	for word, count := range wordList {
-		counts = append(counts, wordCount{Word: word, Count: count})
+func generateNgrams(text string, n int) []string {
+	words := strings.Fields(text)
+	var ngrams []string
+	for i := 0; i < len(words)-n+1; i++ {
+		ngrams = append(ngrams, strings.Join(words[i:i+n], " "))
 	}
+	return ngrams
+}
 
-	sort.Slice(counts, func(i, j int) bool {
-		return counts[i].Count > counts[j].Count
-	})
+func uniqueStrings(str string) map[string]bool {
+	words := strings.Fields(str)
+	uniqueWords := make(map[string]bool)
+	for _, word := range words {
+		uniqueWords[word] = true
+	}
+	return uniqueWords
+}
 
-	for _, wc := range counts {
-		_, err := fmt.Fprintln(file, wc.Word)
-		if err != nil {
-			fmt.Fprintf(os.Stderr, "Error writing to output file: %v\n", err)
-			os.Exit(1)
-		}
+func uniqueStringsSlice(strs []string) map[string]bool {
+	uniqueStrings := make(map[string]bool)
+	for _, str := range strs {
+		uniqueStrings[str] = true
 	}
+	return uniqueStrings
 }
 
 // main function
 func main() {
 	clearScreen()
 
-	if cycloneFlag {
+	cycloneFlag := flag.Bool("cyclone", false, "Display coded message")
+	versionFlag := flag.Bool("version", false, "Display version")
+	urlFlag := flag.String("url", "", "URL of the website to scrape")
+	ngramFlag := flag.String("ngram", "1", "Lengths of n-grams (e.g., \"1-3\" for 1, 2, and 3-length n-grams). Default: 1")
+	oFlag := flag.String("o", "", "Output file for the n-grams")
+	crawlFlag := flag.Int("crawl", 1, "Number of links to crawl (default: 1)")
+	flag.Parse()
+
+	if *cycloneFlag {
 		codedBy := "Q29kZWQgYnkgY3ljbG9uZSA7KQo="
 		codedByDecoded, _ := base64.StdEncoding.DecodeString(codedBy)
 		fmt.Fprintln(os.Stderr, string(codedByDecoded))
 		os.Exit(0)
 	}
 
-	if versionFlag {
-		version := "Q3ljbG9uZSdzIFVSTCBTcGlkZXIgdjAuNS4xMAo="
+	if *versionFlag {
+		version := "Q3ljbG9uZSdzIFVSTCBTcGlkZXIgdjAuNi4yCg=="
 		versionDecoded, _ := base64.StdEncoding.DecodeString(version)
 		fmt.Fprintln(os.Stderr, string(versionDecoded))
 		os.Exit(0)
 	}
 
-	if urlFlag == "" {
+	if *urlFlag == "" {
 		fmt.Fprintln(os.Stderr, "Error: -url flag is required")
+		fmt.Fprintln(os.Stderr, "Try running --help for more information")
 		os.Exit(1)
 	}
 
-	if crawlFlag < 1 || crawlFlag > 100 {
-		fmt.Fprintln(os.Stderr, "Error: -crawl flag must be between 1 and 100")
+	if *crawlFlag < 1 || *crawlFlag > 5 {
+		fmt.Fprintln(os.Stderr, "Error: -crawl flag must be between 1 and 5")
 		os.Exit(1)
 	}
 
-	if phraseFlag < 1 || phraseFlag > 100 {
-		fmt.Fprintln(os.Stderr, "Error: -phrase flag must be between 1 and 100")
+	// check for "http*" on urlFlag so goquery doesn't wet the bed
+	u, err := url.Parse(*urlFlag)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Error parsing URL: %v\n", err)
 		os.Exit(1)
 	}
+	if u.Scheme == "" {
+		u.Scheme = "https"
+		*urlFlag = u.String()
+	}
 
-	if oFlag == "" {
-		parsedUrl, err := url.Parse(urlFlag)
+	ngramRange := strings.Split(*ngramFlag, "-")
+	ngramMin, err := strconv.Atoi(ngramRange[0])
+	if err != nil || ngramMin < 1 || ngramMin > 20 {
+		fmt.Fprintln(os.Stderr, "Error: -ngram flag must be between 1 and 20")
+		os.Exit(1)
+	}
+	ngramMax := ngramMin
+	if len(ngramRange) > 1 {
+		ngramMax, err = strconv.Atoi(ngramRange[1])
+		if err != nil || ngramMax < ngramMin || ngramMax > 20 {
+			fmt.Fprintln(os.Stderr, "Error: -ngram flag must be between 1 and 20")
+			os.Exit(1)
+		}
+	}
+
+	if *oFlag == "" {
+		parsedUrl, err := url.Parse(*urlFlag)
 		if err != nil {
 			fmt.Fprintln(os.Stderr, "Error parsing URL")
 			os.Exit(1)
 		}
-		// default wordlist output if -oFlag is not specified
-		oFlag = strings.TrimPrefix(parsedUrl.Hostname(), "www.") + "_wordlist.txt"
+		*oFlag = strings.TrimPrefix(parsedUrl.Hostname(), "www.") + "_wordlist.txt"
 	}
 
 	start := time.Now()
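The joinURL helper added above resolves each scraped href against the URL of the page it came from. A minimal standalone sketch of the same net/url resolution (the URLs below are illustrative only):

	package main

	import (
		"fmt"
		"net/url"
	)

	func main() {
		base, _ := url.Parse("https://example.com/blog/post.html")
		link, _ := base.Parse("../about") // resolve relative reference against base
		fmt.Println(link.String())        // https://example.com/about
	}

Building on u.Parse keeps absolute hrefs untouched while rewriting relative ones, which is why crawlAndScrape can feed either kind straight into the next fetch.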
@@ -187,50 +221,48 @@ func main() {
 	fmt.Fprintln(os.Stderr, "| Cyclone's URL Spider |")
 	fmt.Fprintln(os.Stderr, " ---------------------- ")
 	fmt.Fprintln(os.Stderr)
-	fmt.Fprintf(os.Stderr, "Crawling URL:\t%s\n", urlFlag)
-
-	c := colly.NewCollector(
-		colly.MaxDepth(crawlFlag),
-		colly.Async(true),
-	)
-
-	// initialize depth to crawlFlag
-	depth := crawlFlag
-
-	// print crawl & depth info
-	fmt.Fprintf(os.Stderr, "Crawl Depth:\t%d\n", depth)
-	fmt.Fprintf(os.Stderr, "Phrase Depth:\t%d\n", phraseFlag)
-
-	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
-		link := e.Request.AbsoluteURL(e.Attr("href"))
-		if link != "" && depth > 0 { // check if depth is greater than 0
-			depth-- // decrement depth after visiting a link
-			e.Request.Visit(link)
-			time.Sleep(250 * time.Millisecond) // add short sleep time between requests to keep from being rate limited
+	fmt.Fprintf(os.Stderr, "Crawling URL:\t%s\n", *urlFlag)
+	fmt.Fprintf(os.Stderr, "Crawl depth:\t%d\n", *crawlFlag)
+	fmt.Fprintf(os.Stderr, "ngram len:\t%s\n", *ngramFlag)
+
+	ngrams := make(map[string]bool)
+	for i := ngramMin; i <= ngramMax; i++ {
+		for ngram := range crawlAndScrape(*urlFlag, *crawlFlag, i) {
+			ngrams[ngram] = true
 		}
-	})
-
-	// only collect text from these elements using colly.HTML
-	c.OnHTML("p, h1, h2, h3, h4, h5, h6, li", func(e *colly.HTMLElement) {
-		processWords(e.Text, phraseFlag)
-	})
+	}
 
-	c.OnScraped(func(r *colly.Response) {
-		saveWordList(oFlag)
-	})
+	// extract n-grams into a slice
+	var ngramSlice []string
+	for ngram := range ngrams {
+		ngramSlice = append(ngramSlice, ngram)
+	}
 
-	err := c.Visit(urlFlag)
+	// write unique n-grams to file
+	file, err := os.Create(*oFlag)
 	if err != nil {
-		fmt.Fprintf(os.Stderr, "Error crawling URL: %v\n", err)
-		os.Exit(1)
+		fmt.Println("Error creating file:", err)
+		return
+	}
+	defer file.Close()
+
+	for _, ngram := range ngramSlice {
+		file.WriteString(ngram + "\n")
 	}
 
-	c.Wait()
+	// calculate unique words
+	uniqueWords := len(uniqueStrings(strings.Join(ngramSlice, " ")))
+
+	// calculate unique n-grams
+	uniqueNgrams := len(ngramSlice)
+
+	runtime := time.Since(start)
 
-	// print runtime results
-	fmt.Fprintf(os.Stderr, "Unique words:\t%d\n", len(wordList))
-	fmt.Fprintf(os.Stderr, "Wordlist:\t%s\n", oFlag)
-	fmt.Fprintf(os.Stderr, "Runtime:\t%s\n", time.Since(start))
+	// print statistics
+	fmt.Fprintf(os.Stderr, "Unique words:\t%d\n", uniqueWords)
+	fmt.Fprintf(os.Stderr, "Unique ngrams:\t%d\n", uniqueNgrams)
+	fmt.Fprintf(os.Stderr, "Saved to:\t%s\n", *oFlag)
+	fmt.Fprintf(os.Stderr, "Runtime:\t%.6fs\n", runtime.Seconds())
 }
 
 // end code
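A self-contained sketch of the sliding-window n-gram generation this commit introduces; generateNgrams is copied from the diff above, and the sample text is illustrative only:

	package main

	import (
		"fmt"
		"strings"
	)

	func generateNgrams(text string, n int) []string {
		words := strings.Fields(text)
		var ngrams []string
		for i := 0; i < len(words)-n+1; i++ {
			ngrams = append(ngrams, strings.Join(words[i:i+n], " "))
		}
		return ngrams
	}

	func main() {
		fmt.Println(generateNgrams("the quick brown fox", 2))
		// prints: [the quick quick brown brown fox]
	}

With the flags defined in main, a typical run would look like: go run cyclone_url_spider.go -url example.com -ngram 1-3 -crawl 2 (the source file name is an assumption). The missing scheme is rewritten to https, and with no -o flag the unique n-grams land in example.com_wordlist.txt.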