package main

import (
	"flag"
	"fmt"      // formatted output to the console
	"io"
	"net/http" // for sending HTTP requests
	"net/url"  // for URL parsing and resolution
	"strings"  // string manipulation and testing
	"sync"     // for the thread-safe sync.Map
	"sync/atomic" // for atomic updates to the processing counter
	"time"

	"github.com/PuerkitoBio/goquery"
	log "github.com/romana/rlog"
)
// filterFunc is the signature of a URL filter: it returns true
// if the given URL should be crawled by the given Crawler.
type filterFunc func(string, Crawler) bool
// Crawler holds the state of a crawl of a single website.
type Crawler struct {
	// the base URL of the website being crawled
	host string
	// channel on which the crawler receives new (unfiltered) URLs to crawl;
	// everything received here is passed through the chain of filters,
	// and only URLs that pass every filter reach the filteredUrls channel
	urls chan string
	// channel on which the crawler receives filtered URLs
	filteredUrls chan string
	// channel for collecting crawled webpages
	webpages chan Webpage
	// channel on which the quit signal is sent
	quit chan string
	// the filters to apply to every discovered URL
	filters []filterFunc
	// depth at which each URL was discovered
	depth sync.Map
	// visited URLs
	visited sync.Map
	// politeness delay between requests, in seconds
	politeness int
	// maximum depth to crawl
	maxdepth int
	// how many URLs have been crawled
	count int
	// number of goroutines still processing work
	processing int32
}
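// For illustration only (not part of the original file): a minimal sketch of a
// filterFunc implementation. The name exampleSameHostFilter is hypothetical;
// the filters actually registered in main (IsInternal, IsValidPath,
// IsValidSubdomain) are defined elsewhere in this repository.
func exampleSameHostFilter(link string, crawler Crawler) bool {
	// allow only URLs that stay on the crawler's base host
	return strings.HasPrefix(link, crawler.host)
}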
// start launches the crawler. It starts three goroutines:
// the first waits for new URLs as they get extracted,
// the second waits for filtered URLs that have passed through all
// registered filters, and the third collects crawled webpages.
func (crawler *Crawler) start(wsite *website) {
	// wait for new URLs to be extracted and passed to the urls channel
	go func() {
		for {
			select {
			case url := <-crawler.urls:
				atomic.AddInt32(&crawler.processing, 1)
				go crawler.filter(url)
			case <-crawler.quit:
				log.Debugf("> Closing urls channel")
				close(crawler.urls)
				return
			}
		}
	}()
	// wait for filtered URLs to arrive on the filteredUrls channel
	go func() {
		for {
			select {
			case url := <-crawler.filteredUrls:
				crawler.count++
				// log.Debugf("%d: Crawling %s ", crawler.count, url)
				atomic.AddInt32(&crawler.processing, 1)
				wpage := Webpage{url, nil}
				go crawler.crawl(&wpage)
				log.Debugf("Waiting %d second(s) before the next request", crawler.politeness)
				time.Sleep(time.Duration(crawler.politeness) * time.Second)
			case <-crawler.quit:
				log.Debugf("> Closing filteredUrls channel")
				close(crawler.filteredUrls)
				return
			}
		}
	}()
	// collect Webpage objects produced by crawling
	go func() {
		for {
			select {
			case wpage := <-crawler.webpages:
				wsite.AddWebpage(wpage)
				log.Infof("%d: Added %s", crawler.count, wpage.URL)
			case <-crawler.quit:
				close(crawler.webpages)
				return
			}
		}
	}()
}
// filter applies every registered filter to the given URL; only if the URL
// passes through all of the filters is it forwarded to the filteredUrls channel.
func (crawler *Crawler) filter(url string) {
	defer func() { atomic.AddInt32(&crawler.processing, -1) }()
	for _, fn := range crawler.filters {
		if !fn(url, *crawler) {
			return
		}
	}
	atomic.AddInt32(&crawler.processing, 1)
	go func() {
		defer func() { atomic.AddInt32(&crawler.processing, -1) }()
		crawler.filteredUrls <- url
	}()
}
// crawl sends an HTTP GET request for the webpage's URL, reads the
// response body, and extracts the URLs it references.
func (crawler *Crawler) crawl(wpage *Webpage) {
	defer func() { atomic.AddInt32(&crawler.processing, -1) }()
	url := wpage.URL
	depth, _ := crawler.depth.Load(url)
	visited, _ := crawler.visited.Load(url)
	if !visited.(bool) && depth.(int) <= crawler.maxdepth {
		resp, err := http.Get(url) // fetch the URL
		if err != nil {
			log.Debug("An error has occurred while fetching: " + url)
			log.Debug(err)
		} else {
			defer resp.Body.Close()
			crawler.extractUrls(wpage, resp.Body)
			log.Debugf("References for %s are %d", wpage.URL, len(wpage.References))
			crawler.visited.Store(url, true)
			crawler.webpages <- *wpage
		}
	} else {
		log.Debugf("For %s: Depth : %d and visited : %t", url, depth, visited)
	}
}
// extractUrls parses the response body with goquery, extracts every anchor
// href, resolves relative references against the page's base URL, and sends
// newly discovered URLs to the urls channel.
func (crawler *Crawler) extractUrls(wpage *Webpage, body io.ReadCloser) {
	doc, err := goquery.NewDocumentFromReader(body)
	pageURL := wpage.URL
	baseURL, _ := url.Parse(pageURL)
	if err != nil {
		log.Debugf("Error parsing goquery: %s", pageURL)
		log.Debug(err)
		return
	}
	doc.Find("body a").Each(func(i int, s *goquery.Selection) {
		rawHref, ok := s.Attr("href")
		if ok {
			rawHref = strings.Split(rawHref, "#")[0]  // drop any fragment
			rawHref = strings.TrimRight(rawHref, "/") // drop a trailing slash
			href, _ := url.Parse(rawHref)
			// resolve relative URLs against the page's base URL
			if strings.HasPrefix(rawHref, "/") ||
				strings.HasPrefix(rawHref, ".") {
				href = baseURL.ResolveReference(href)
			}
			wpage.References = append(wpage.References, href.String())
			_, visited := crawler.visited.Load(href.String())
			if !visited {
				crawler.visited.Store(href.String(), false)
				depth, _ := crawler.depth.Load(pageURL)
				crawler.depth.Store(href.String(), depth.(int)+1)
				atomic.AddInt32(&crawler.processing, 1)
				go func() {
					defer func() { atomic.AddInt32(&crawler.processing, -1) }()
					crawler.urls <- href.String()
				}()
			}
		}
	})
}
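// For illustration only (not part of the original file): a minimal sketch of
// how relative references are resolved above. The URLs are hypothetical
// examples; this function is never called by the crawler.
func exampleResolveReference() string {
	base, _ := url.Parse("http://example.com/docs/index.html")
	ref, _ := url.Parse("./page2")
	// RFC 3986 resolution yields "http://example.com/docs/page2"
	return base.ResolveReference(ref).String()
}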
// addFilter registers a new URL filter with the crawler.
func (crawler *Crawler) addFilter(filter filterFunc) {
	crawler.filters = append(crawler.filters, filter)
}
func main() {
	var maxdepth int
	var startURL string
	flag.StringVar(&startURL, "u", "http://sumit.murari.me", "URL to start crawling from")
	flag.IntVar(&maxdepth, "d", 2, "Maximum depth to crawl")
	flag.Parse()
	wsite := website{startURL, nil}
	c := Crawler{
		host:         startURL,
		urls:         make(chan string, 10),
		filteredUrls: make(chan string, 10),
		webpages:     make(chan Webpage, 10),
		quit:         make(chan string),
		filters:      make([]filterFunc, 0),
		depth:        sync.Map{},
		visited:      sync.Map{},
		politeness:   2,
		maxdepth:     maxdepth,
		count:        0,
		processing:   0,
	}
	c.addFilter(IsInternal)
	c.addFilter(IsValidPath)
	c.addFilter(IsValidSubdomain)
	c.depth.Store(startURL, 0)
	c.visited.Store(startURL, false)
	c.start(&wsite)
	c.urls <- c.host
	for {
		log.Debugf("urls queue: %d ; filteredUrls queue: %d; processing: %d ",
			len(c.urls), len(c.filteredUrls), atomic.LoadInt32(&c.processing))
		time.Sleep(2 * time.Second)
		if len(c.filteredUrls) == 0 && len(c.urls) == 0 && atomic.LoadInt32(&c.processing) == 0 {
			log.Debugf("urls and filteredUrls channels are empty and no URL is being processed")
			// send one quit signal to each of the three goroutines started in start()
			c.quit <- "done"
			c.quit <- "done"
			c.quit <- "done"
			break
		}
	}
	fmt.Println("Printing website")
	wsite.PrintBasicSiteMap()
	wsite.PrintSiteGraph()
	fmt.Println("Good bye")
}
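// Usage sketch (an assumption; standard Go tooling, run from this package's
// directory alongside the other files of this repository). The -u and -d
// flags match the ones registered in main above.
//
//	go run . -u https://example.com -d 3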