
Commit 0098289

v0.6.2
Fixed scraping logic and ngram creation bugs; switched from gocolly to goquery for web scraping; removed duplicates from word/ngram output.
1 parent d793e87 commit 0098289
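
The headline change swaps gocolly's callback-driven collector for direct goquery queries over a plain net/http response. A minimal standalone sketch of the fetch-and-extract pattern this commit adopts (not part of the commit; example.com is a placeholder URL):

package main

import (
	"fmt"
	"log"
	"net/http"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	// fetch the page
	res, err := http.Get("https://example.com")
	if err != nil {
		log.Fatal(err)
	}
	defer res.Body.Close()

	// parse the response body into a queryable document
	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		log.Fatal(err)
	}

	// drop script/style nodes so only visible text remains,
	// mirroring getTextFromDocument in the diff below
	doc.Find("script, style").Each(func(i int, s *goquery.Selection) {
		s.Remove()
	})
	fmt.Println(doc.Text())
}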

File tree

1 file changed: +165 −137 lines


spider.go

Lines changed: 165 additions & 137 deletions
@@ -4,54 +4,25 @@ import (
 	"encoding/base64"
 	"flag"
 	"fmt"
-	"github.com/gocolly/colly/v2"
+	"github.com/PuerkitoBio/goquery"
+	"net/http"
 	"net/url"
 	"os"
 	"os/exec"
-	"regexp"
 	"runtime"
-	"sort"
+	"strconv"
 	"strings"
-	"sync"
 	"time"
 )

 // cyclone's url spider
+// spider will crawl a url and create a wordlist, or use flag -ngram to create ngrams
 // version 0.5.10; initial github release
-
-// global variables... I know...
-var (
-	urlFlag     string
-	crawlFlag   int
-	oFlag       string
-	phraseFlag  int
-	cycloneFlag bool
-	versionFlag bool
-	wordList    = make(map[string]int)
-	wordListMu  sync.Mutex
-)
-
-// initilize flags
-func init() {
-	flag.StringVar(&urlFlag, "url", "", "URL to scrape")
-	flag.IntVar(&crawlFlag, "crawl", 1, "Depth to crawl links")
-	flag.StringVar(&oFlag, "o", "", "Output file for word list")
-	flag.IntVar(&phraseFlag, "phrase", 1, "Process pairs of words")
-	flag.BoolVar(&cycloneFlag, "cyclone", false, "")
-	flag.BoolVar(&versionFlag, "version", false, "Version number")
-	flag.Parse()
-
-	// check for "http*" on urlFlag so gocolly doesn't wet the bed
-	u, err := url.Parse(urlFlag)
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "Error parsing URL: %v\n", err)
-		os.Exit(1)
-	}
-	if u.Scheme == "" {
-		u.Scheme = "https"
-		urlFlag = u.String()
-	}
-}
+/* version 0.6.2;
+fixed scraping logic & ngram creations bugs
+switched from gocolly to goquery for web scraping
+remove dups from word / ngrams output
+*/

 // clear screen function
 func clearScreen() {
@@ -71,114 +42,173 @@ func clearScreen() {
 	}
 }

-// word processing logic
-func processWords(text string, phrase int) {
-	// acquire lock before accessing wordList
-	wordListMu.Lock()
-	defer wordListMu.Unlock()
+// goquery
+func getDocumentFromURL(targetURL string) (*goquery.Document, error) {
+	client := &http.Client{}
+	req, err := http.NewRequest("GET", targetURL, nil)
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
+	res, err := client.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer res.Body.Close()
+	return goquery.NewDocumentFromReader(res.Body)
+}
+
+func getLinksFromDocument(doc *goquery.Document) []string {
+	var links []string
+	doc.Find("a[href]").Each(func(index int, item *goquery.Selection) {
+		linkTag := item
+		link, _ := linkTag.Attr("href")
+		links = append(links, link)
+	})
+	return links
+}

-	wordRegex := regexp.MustCompile(`\w+`)
-	words := wordRegex.FindAllString(text, -1)
+func getTextFromDocument(doc *goquery.Document) string {
+	doc.Find("script, style").Each(func(index int, item *goquery.Selection) {
+		item.Remove()
+	})
+	return doc.Text()
+}

-	for i := 0; i < len(words); i++ {
-		if i+phrase <= len(words) {
-			phraseWords := make([]string, phrase)
-			for j := 0; j < phrase; j++ {
-				phraseWords[j] = words[i+j]
-			}
-			phraseStr := strings.Join(phraseWords, " ")
-			if _, ok := wordList[phraseStr]; ok {
-				wordList[phraseStr]++
-			} else {
-				wordList[phraseStr] = 1
-			}
-		} else {
-			word := words[i]
-			if _, ok := wordList[word]; ok {
-				wordList[word]++
-			} else {
-				wordList[word] = 1
+func crawlAndScrape(u string, depth int, phrase int) map[string]bool {
+	ngrams := make(map[string]bool)
+	doc, err := getDocumentFromURL(u)
+	if err != nil {
+		fmt.Println("Error fetching URL:", err)
+		return ngrams
+	}
+	text := getTextFromDocument(doc)
+	for _, ngram := range generateNgrams(text, phrase) {
+		ngrams[ngram] = true
+	}
+
+	if depth > 1 {
+		links := getLinksFromDocument(doc)
+		for _, link := range links[:depth-1] {
+			absoluteLink := joinURL(u, link)
+			childNgrams := crawlAndScrape(absoluteLink, depth-1, phrase)
+			for ngram := range childNgrams {
+				ngrams[ngram] = true
 			}
 		}
 	}
+
+	return ngrams
 }

-// save wordlist logic
-func saveWordList(filename string) {
-	// acquire lock before accessing wordList
-	wordListMu.Lock()
-	defer wordListMu.Unlock()
-	file, err := os.Create(filename)
+func joinURL(baseURL, relativeURL string) string {
+	u, err := url.Parse(baseURL)
 	if err != nil {
-		fmt.Fprintf(os.Stderr, "Error creating output file: %v\n", err)
-		os.Exit(1)
+		return ""
 	}
-	defer file.Close()
-
-	type wordCount struct {
-		Word  string
-		Count int
+	newURL, err := u.Parse(relativeURL)
+	if err != nil {
+		return ""
 	}
+	return newURL.String()
+}

-	var counts []wordCount
-	for word, count := range wordList {
-		counts = append(counts, wordCount{Word: word, Count: count})
+func generateNgrams(text string, n int) []string {
+	words := strings.Fields(text)
+	var ngrams []string
+	for i := 0; i < len(words)-n+1; i++ {
+		ngrams = append(ngrams, strings.Join(words[i:i+n], " "))
 	}
+	return ngrams
+}

-	sort.Slice(counts, func(i, j int) bool {
-		return counts[i].Count > counts[j].Count
-	})
+func uniqueStrings(str string) map[string]bool {
+	words := strings.Fields(str)
+	uniqueWords := make(map[string]bool)
+	for _, word := range words {
+		uniqueWords[word] = true
+	}
+	return uniqueWords
+}

-	for _, wc := range counts {
-		_, err := fmt.Fprintln(file, wc.Word)
-		if err != nil {
-			fmt.Fprintf(os.Stderr, "Error writing to output file: %v\n", err)
-			os.Exit(1)
-		}
+func uniqueStringsSlice(strs []string) map[string]bool {
+	uniqueStrings := make(map[string]bool)
+	for _, str := range strs {
+		uniqueStrings[str] = true
 	}
+	return uniqueStrings
 }

 // main function
 func main() {
 	clearScreen()

-	if cycloneFlag {
+	cycloneFlag := flag.Bool("cyclone", false, "Display coded message")
+	versionFlag := flag.Bool("version", false, "Display version")
+	urlFlag := flag.String("url", "", "URL of the website to scrape")
+	ngramFlag := flag.String("ngram", "1", "Lengths of n-grams (e.g., \"1-3\" for 1, 2, and 3-length n-grams). Default: 1")
+	oFlag := flag.String("o", "", "Output file for the n-grams")
+	crawlFlag := flag.Int("crawl", 1, "Number of links to crawl (default: 1)")
+	flag.Parse()
+
+	if *cycloneFlag {
 		codedBy := "Q29kZWQgYnkgY3ljbG9uZSA7KQo="
 		codedByDecoded, _ := base64.StdEncoding.DecodeString(codedBy)
 		fmt.Fprintln(os.Stderr, string(codedByDecoded))
 		os.Exit(0)
 	}

-	if versionFlag {
-		version := "Q3ljbG9uZSdzIFVSTCBTcGlkZXIgdjAuNS4xMAo="
+	if *versionFlag {
+		version := "Q3ljbG9uZSdzIFVSTCBTcGlkZXIgdjAuNi4yCg=="
 		versionDecoded, _ := base64.StdEncoding.DecodeString(version)
 		fmt.Fprintln(os.Stderr, string(versionDecoded))
 		os.Exit(0)
 	}

-	if urlFlag == "" {
+	if *urlFlag == "" {
 		fmt.Fprintln(os.Stderr, "Error: -url flag is required")
+		fmt.Fprintln(os.Stderr, "Try running --help for more information")
 		os.Exit(1)
 	}

-	if crawlFlag < 1 || crawlFlag > 100 {
-		fmt.Fprintln(os.Stderr, "Error: -crawl flag must be between 1 and 100")
+	if *crawlFlag < 1 || *crawlFlag > 5 {
+		fmt.Fprintln(os.Stderr, "Error: -crawl flag must be between 1 and 5")
 		os.Exit(1)
 	}

-	if phraseFlag < 1 || phraseFlag > 100 {
-		fmt.Fprintln(os.Stderr, "Error: -phrase flag must be between 1 and 100")
+	// check for "http*" on urlFlag so goquery doesn't wet the bed
+	u, err := url.Parse(*urlFlag)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Error parsing URL: %v\n", err)
 		os.Exit(1)
 	}
+	if u.Scheme == "" {
+		u.Scheme = "https"
+		*urlFlag = u.String()
+	}

-	if oFlag == "" {
-		parsedUrl, err := url.Parse(urlFlag)
+	ngramRange := strings.Split(*ngramFlag, "-")
+	ngramMin, err := strconv.Atoi(ngramRange[0])
+	if err != nil || ngramMin < 1 || ngramMin > 20 {
+		fmt.Fprintln(os.Stderr, "Error: -ngram flag must be between 1 and 20")
+		os.Exit(1)
+	}
+	ngramMax := ngramMin
+	if len(ngramRange) > 1 {
+		ngramMax, err = strconv.Atoi(ngramRange[1])
+		if err != nil || ngramMax < ngramMin || ngramMax > 20 {
+			fmt.Fprintln(os.Stderr, "Error: -ngram flag must be between 1 and 20")
+			os.Exit(1)
+		}
+	}
+
+	if *oFlag == "" {
+		parsedUrl, err := url.Parse(*urlFlag)
 		if err != nil {
 			fmt.Fprintln(os.Stderr, "Error parsing URL")
 			os.Exit(1)
 		}
-		// default wordlist output if -oFlag is not specified
-		oFlag = strings.TrimPrefix(parsedUrl.Hostname(), "www.") + "_wordlist.txt"
+		*oFlag = strings.TrimPrefix(parsedUrl.Hostname(), "www.") + "_wordlist.txt"
 	}

 	start := time.Now()
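
The n-gram fix above reduces to a sliding window over whitespace-split words, with duplicates removed by collecting results as keys of a map[string]bool. A toy run of the same logic (standalone, not part of the commit):

package main

import (
	"fmt"
	"strings"
)

// same sliding-window logic as generateNgrams in the diff above
func generateNgrams(text string, n int) []string {
	words := strings.Fields(text)
	var ngrams []string
	for i := 0; i < len(words)-n+1; i++ {
		ngrams = append(ngrams, strings.Join(words[i:i+n], " "))
	}
	return ngrams
}

func main() {
	// "the cat and the cat" yields four bigrams, one of them a duplicate
	unique := make(map[string]bool)
	for _, g := range generateNgrams("the cat and the cat", 2) {
		unique[g] = true
	}
	fmt.Println(len(unique)) // 3: "the cat", "cat and", "and the"
}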
@@ -187,50 +217,48 @@ func main() {
 	fmt.Fprintln(os.Stderr, "| Cyclone's URL Spider |")
 	fmt.Fprintln(os.Stderr, " ---------------------- ")
 	fmt.Fprintln(os.Stderr)
-	fmt.Fprintf(os.Stderr, "Crawling URL:\t%s\n", urlFlag)
-
-	c := colly.NewCollector(
-		colly.MaxDepth(crawlFlag),
-		colly.Async(true),
-	)
-
-	// initialize depth to crawlFlag
-	depth := crawlFlag
-
-	// print crawl & depth info
-	fmt.Fprintf(os.Stderr, "Crawl Depth:\t%d\n", depth)
-	fmt.Fprintf(os.Stderr, "Phrase Depth:\t%d\n", phraseFlag)
-
-	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
-		link := e.Request.AbsoluteURL(e.Attr("href"))
-		if link != "" && depth > 0 { // check if depth is greater than 0
-			depth-- // decrement depth after visiting a link
-			e.Request.Visit(link)
-			time.Sleep(250 * time.Millisecond) // add short sleep time between requests to keep from being rate limited
+	fmt.Fprintf(os.Stderr, "Crawling URL:\t%s\n", *urlFlag)
+	fmt.Fprintf(os.Stderr, "Crawl depth:\t%d\n", *crawlFlag)
+	fmt.Fprintf(os.Stderr, "ngram len:\t%s\n", *ngramFlag)
+
+	ngrams := make(map[string]bool)
+	for i := ngramMin; i <= ngramMax; i++ {
+		for ngram := range crawlAndScrape(*urlFlag, *crawlFlag, i) {
+			ngrams[ngram] = true
 		}
-	})
-
-	// only collect text from these elements using colly.HTML
-	c.OnHTML("p, h1, h2, h3, h4, h5, h6, li", func(e *colly.HTMLElement) {
-		processWords(e.Text, phraseFlag)
-	})
+	}

-	c.OnScraped(func(r *colly.Response) {
-		saveWordList(oFlag)
-	})
+	// extract n-grams into a slice
+	var ngramSlice []string
+	for ngram := range ngrams {
+		ngramSlice = append(ngramSlice, ngram)
+	}

-	err := c.Visit(urlFlag)
+	// write unique n-grams to file
+	file, err := os.Create(*oFlag)
 	if err != nil {
-		fmt.Fprintf(os.Stderr, "Error crawling URL: %v\n", err)
-		os.Exit(1)
+		fmt.Println("Error creating file:", err)
+		return
+	}
+	defer file.Close()
+
+	for _, ngram := range ngramSlice {
+		file.WriteString(ngram + "\n")
 	}

-	c.Wait()
+	// calculate unique words
+	uniqueWords := len(uniqueStrings(strings.Join(ngramSlice, " ")))
+
+	// calculate unique n-grams
+	uniqueNgrams := len(ngramSlice)
+
+	runtime := time.Since(start)

-	// print runtime results
-	fmt.Fprintf(os.Stderr, "Unique words:\t%d\n", len(wordList))
-	fmt.Fprintf(os.Stderr, "Wordlist:\t%s\n", oFlag)
-	fmt.Fprintf(os.Stderr, "Runtime:\t%s\n", time.Since(start))
+	// print statistics
+	fmt.Fprintf(os.Stderr, "Unique words:\t%d\n", uniqueWords)
+	fmt.Fprintf(os.Stderr, "Unique ngrams:\t%d\n", uniqueNgrams)
+	fmt.Fprintf(os.Stderr, "Saved to:\t%s\n", *oFlag)
+	fmt.Fprintf(os.Stderr, "Runtime:\t%.6fs\n", runtime.Seconds())
 }

 // end code
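
With the new flags wired up, a typical invocation looks like the following (a sketch, assuming the file builds as spider.go; the URL and output name are placeholders):

go run spider.go -url https://example.com -crawl 2 -ngram 1-3 -o example_wordlist.txt

-ngram accepts a single length ("2") or a range ("1-3"), -crawl is capped at 5 in this version, and omitting -o writes to <hostname>_wordlist.txt.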
