-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.go
279 lines (247 loc) · 7.85 KB
/
scrape.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
package main
import (
"encoding/json"
"fmt"
"golang.org/x/net/html"
"io/ioutil"
"log"
"net/http"
"os"
//"reflect"
"regexp"
"strconv"
"strings"
"time"
)
// Option is one usable entry of an HTML <select> element: the "value"
// attribute of an <option> tag paired with the text shown for it.
type Option struct {
	// code is the option's "value" attribute; name is the data of the
	// option's first child node (its visible label).
	code, name string
}
// Attribute is an HTML attribute key/value pair used as a search criterion
// by getElement: a node matches when it has an attribute whose key equals
// name and whose value equals value.
type Attribute struct {
	name, value string
}
// USAirNetMapStationData is the JSON record kept for one scraped station.
// scrapeStation fills it in from the station's forecast page.
type USAirNetMapStationData struct {
// Code is the station code that scrapeStation was asked to scrape.
Code string `json:"code"`
// Name is the station name taken from the page heading (everything before
// the final ", "-separated part, which is treated as the state name).
Name string `json:"name"`
// Latitude is the number that follows "North:" on the station detail line.
Latitude float64 `json:"latitude"`
// Longitude is the number that follows "West:" on the station detail line.
// It is stored exactly as scraped (degrees West), with no sign change.
Longitude float64 `json:"longitude"`
}
// USAirNetMapStateData is the JSON record kept for one state and the
// stations scraped for it.
type USAirNetMapStateData struct {
// Code is the state code that was passed to scrapeStation.
Code string `json:"code"`
// Name is the state name taken from the last ", "-separated part of a
// station page heading.
Name string `json:"name"`
// Stations holds the state's station records, keyed by station code.
Stations map[string]*USAirNetMapStationData `json:"stations"`
}
// USAirNetMapData is the root object serialized to and from data.json.
type USAirNetMapData struct {
// States holds one record per state, keyed by state code.
States map[string]*USAirNetMapStateData `json:"states"`
}
// data is the in-memory copy of data.json. main loads it at startup and
// scrapeStation updates it and writes it back after each station.
var data USAirNetMapData
// getSelectOptions collects the usable <option> entries of a <select> node.
//
// Only direct children of node whose Data is "option" are considered, in
// document order. An option is kept when it carries a non-empty "value"
// attribute (if the attribute is repeated, the last occurrence wins). The
// option's name is the Data of its first child node, i.e. the text between
// the tags.
//
// The returned slice is never nil. A nil node yields an empty slice, and an
// option without any child node gets an empty name instead of causing a
// nil-pointer panic.
func getSelectOptions(node *html.Node) []Option {
	options := make([]Option, 0, 1000)
	if node == nil {
		return options
	}
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		if c.Data != "option" {
			continue
		}
		var val string
		for _, a := range c.Attr {
			if a.Key == "value" {
				val = a.Val
			}
		}
		if val == "" {
			// Placeholder entries have no value; skip them.
			continue
		}
		// <option value="X"></option> has no text child, so guard the
		// dereference and fall back to an empty label.
		var label string
		if c.FirstChild != nil {
			label = c.FirstChild.Data
		}
		options = append(options, Option{val, label})
	}
	return options
}
// getElement searches the tree rooted at node, depth first and in document
// order, for the first node whose Data equals tag and that has an attribute
// with key attrib.name and value attrib.value.
//
// The second result is a failure flag: it is false when a node was found
// (the node is returned) and true when nothing matched (the node is nil).
func getElement(node *html.Node, tag string, attrib Attribute) (*html.Node, bool) {
	// The node itself is tested before its descendants.
	if node.Data == tag {
		for i := range node.Attr {
			if node.Attr[i].Key == attrib.name && node.Attr[i].Val == attrib.value {
				return node, false
			}
		}
	}

	// Otherwise recurse into each child in turn and stop at the first hit.
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		found, failed := getElement(child, tag, attrib)
		if failed {
			continue
		}
		return found, false
	}

	return nil, true
}
// getHTML fetches url with an HTTP GET and returns the response body as a
// string.
//
// Every failure is fatal: a transport error (including exceeding the 30s
// overall timeout), a status other than 200 OK, or an error while reading the
// body is logged with log.Fatal/log.Fatalf, which terminates the process.
func getHTML(url string) string {
	// A zero-value Transport means the shared default transport is used, so
	// connections are still reused across calls; only the timeout is added.
	client := http.Client{Timeout: 30 * time.Second}

	response, err := client.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	// Register the close before any other use of the response. (log.Fatal
	// exits without running deferred calls, but the process is ending then.)
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		// err is nil on this path, so report the status itself.
		log.Fatalf("GET %s: unexpected status %s", url, response.Status)
	}

	bodyBytes, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log.Fatalf("GET %s: reading response body: %v", url, err)
	}
	return string(bodyBytes)
}
// getNodeFromUrl downloads the page at url via getHTML and parses it as
// HTML, returning the root node of the parsed tree. A parse error is fatal
// (log.Fatal).
func getNodeFromUrl(url string) *html.Node {
	page := strings.NewReader(getHTML(url))

	doc, parseErr := html.Parse(page)
	if parseErr != nil {
		log.Fatal(parseErr)
	}

	return doc
}
// stationDetailSpaces matches a run of one or more space characters. It is
// used to tokenize the station detail line and is compiled once here instead
// of on every scrapeStation call.
var stationDetailSpaces = regexp.MustCompile(` +`)

// stationHeadingPrefix is the text expected in front of "<station>, <state>"
// in the heading of a station page.
const stationHeadingPrefix = "Aviation Weather Forecast at "

// scrapeStation downloads the forecast page for one station, extracts the
// station name, state name and coordinates from it, merges them into the
// package-level data and rewrites data.json.
//
// state and station are the codes placed in the request URL; station is also
// checked against the station code printed on the page. Any problem (missing
// elements, unexpected page text, a name that disagrees with what is already
// stored, or a failure to save) is fatal via log.Fatal.
func scrapeStation(state string, station string) {
	url := "http://www.usairnet.com/cgi-bin/launch/code.cgi?Submit=Go&state=" + state + "&sta=" + station
	root := getNodeFromUrl(url)

	// The station title is the first grandchild of <span class="bolder">.
	headingSpan, fail := getElement(root, "span", Attribute{"class", "bolder"})
	if fail {
		log.Fatal("scrapeStation(" + state + ", " + station + ") failed to get span class bolder")
	}
	headingStrong := headingSpan.FirstChild
	if headingStrong == nil || headingStrong.FirstChild == nil {
		log.Fatal("scrapeStation(" + state + ", " + station + ") found span class bolder without heading text")
	}
	stationName, stateName, err := parseStationHeading(headingStrong.FirstChild.Data)
	if err != nil {
		log.Fatal(err)
	}

	// The details line (station code and coordinates) is the first child of
	// <span class="norm2">.
	detailSpan, fail := getElement(root, "span", Attribute{"class", "norm2"})
	if fail {
		log.Fatal("scrapeStation(" + state + ", " + station + ") failed to get span class norm2")
	}
	if detailSpan.FirstChild == nil {
		log.Fatal("scrapeStation(" + state + ", " + station + ") found span class norm2 without detail text")
	}
	north, west, err := parseStationDetail(detailSpan.FirstChild.Data, station)
	if err != nil {
		log.Fatal(err)
	}

	if err := recordStation(state, station, stateName, stationName, north, west); err != nil {
		log.Fatal(err)
	}

	fmt.Printf("%s Station %s was successfully scraped\n", state, station)
	fmt.Printf("- Name: %s\n", stationName)
	fmt.Printf("- Latitude: %f North\n", north)
	fmt.Printf("- Longitude: %f West\n", west)
	fmt.Printf("Saving...")
	if err := saveStationData(); err != nil {
		// Do not claim success when nothing was written.
		log.Fatal(err)
	}
	fmt.Printf(" done!\n")
}

// parseStationHeading splits a page heading into station name and state name.
//
// The heading must contain stationHeadingPrefix, which is removed. The rest
// is split on ", ": the last part is the state name and the preceding parts,
// rejoined with ", ", are the station name (so station names may themselves
// contain commas). A heading with no ", " yields an empty station name.
func parseStationHeading(headingText string) (stationName, stateName string, err error) {
	if !strings.Contains(headingText, stationHeadingPrefix) {
		return "", "", fmt.Errorf("Heading text (%s) doesn't contain expected prefix (%s)", headingText, stationHeadingPrefix)
	}
	stationText := strings.Replace(headingText, stationHeadingPrefix, "", -1)
	parts := strings.Split(stationText, ", ")
	last := len(parts) - 1 // Split never returns an empty slice here
	return strings.Join(parts[:last], ", "), parts[last], nil
}

// parseStationDetail extracts the coordinates from the station detail line.
//
// The line is split on runs of spaces and must have "Station:" at field 0,
// the expected station code at field 1, "North:" at field 3 followed by the
// latitude, and "West:" at field 6 followed by the longitude. Missing fields
// are reported as errors rather than causing an index-out-of-range panic.
func parseStationDetail(text, station string) (north, west float64, err error) {
	tokens := stationDetailSpaces.Split(text, -1)
	// field returns "" for a missing token so that the checks below fail
	// with their usual messages on a truncated line.
	field := func(i int) string {
		if i < len(tokens) {
			return tokens[i]
		}
		return ""
	}

	if field(0) != "Station:" {
		return 0, 0, fmt.Errorf("Error finding 'Station:' in %s", text)
	}
	if field(1) != station {
		return 0, 0, fmt.Errorf("Scraped station code %s, expected %s", field(1), station)
	}
	// all USA should be North/West, so just assert them
	if field(3) != "North:" {
		return 0, 0, fmt.Errorf("Error finding 'North:' name in %s", text)
	}
	// Parse at 64-bit precision: the value is stored in a float64, and a
	// 32-bit parse would bake float32 rounding error into data.json.
	north, err = strconv.ParseFloat(field(4), 64)
	if err != nil {
		return 0, 0, fmt.Errorf("parsing latitude in %q: %v", text, err)
	}
	if field(6) != "West:" {
		return 0, 0, fmt.Errorf("Error finding 'West:' name in %s", text)
	}
	west, err = strconv.ParseFloat(field(7), 64)
	if err != nil {
		return 0, 0, fmt.Errorf("parsing longitude in %q: %v", text, err)
	}
	return north, west, nil
}

// recordStation merges one scraped station into the package-level data,
// creating the state and station records (and their maps) on first use.
//
// Codes and names are only assigned when still empty; a name that is already
// stored must equal the scraped one, otherwise an error is returned. Newly
// assigned names are printed. Coordinates are overwritten without checking.
func recordStation(state, station, stateName, stationName string, north, west float64) error {
	if data.States == nil {
		data.States = map[string]*USAirNetMapStateData{}
	}
	stateData := data.States[state]
	if stateData == nil {
		stateData = &USAirNetMapStateData{}
		data.States[state] = stateData
	}
	if stateData.Code == "" {
		stateData.Code = state
	}
	if stateData.Name == "" {
		stateData.Name = stateName
		fmt.Println(stateName)
	} else if stateData.Name != stateName {
		return fmt.Errorf("scraped state name %s not equal to existing state name %s", stateName, stateData.Name)
	}

	if stateData.Stations == nil {
		stateData.Stations = map[string]*USAirNetMapStationData{}
	}
	stationData := stateData.Stations[station]
	if stationData == nil {
		stationData = &USAirNetMapStationData{}
		stateData.Stations[station] = stationData
	}
	if stationData.Code == "" {
		stationData.Code = station
	}
	if stationData.Name == "" {
		stationData.Name = stationName
		fmt.Println(stationName)
	} else if stationData.Name != stationName {
		return fmt.Errorf("scraped station name %s not equal to existing station name %s", stationName, stationData.Name)
	}

	stationData.Latitude = north
	stationData.Longitude = west
	return nil
}

// saveStationData writes the package-level data to data.json as tab-indented
// JSON and reports any encoding or write error.
func saveStationData() error {
	fileData, err := json.MarshalIndent(data, "", "\t")
	if err != nil {
		return fmt.Errorf("encoding data.json: %v", err)
	}
	if err := ioutil.WriteFile("data.json", fileData, 0644); err != nil {
		return fmt.Errorf("writing data.json: %v", err)
	}
	return nil
}
// scrapeState scrapes every station listed for one state.
//
// It loads the state's page, reads the station codes from the <select>
// element named "sta", and calls scrapeStation for each one, pausing two
// seconds after every station. A missing <select> is fatal (log.Fatal).
func scrapeState(state string) {
	const stateURL = "http://www.usairnet.com/cgi-bin/launch/code.cgi?Submit=Go&state="

	page := getNodeFromUrl(stateURL + state)
	dropdown, missing := getElement(page, "select", Attribute{name: "name", value: "sta"})
	if missing {
		log.Fatal("scrapeState(" + state + ") failed to get 'select' element named 'sta'")
	}

	for idx, entry := range getSelectOptions(dropdown) {
		fmt.Printf("station %d: %s - %s\n", idx, entry.code, entry.name)
		scrapeStation(state, entry.code)

		// Throttle requests so the site is not hammered.
		fmt.Printf("\nSleeping for 2secs to give website a break... ")
		time.Sleep(2 * time.Second)
		fmt.Printf("ok let's go again!\n\n")
	}

	fmt.Printf("Done with %s\n", state)
}
// scrapeUSA scrapes every state offered by the site.
//
// It loads the landing page, reads the state codes from the <select> element
// named "state", and calls scrapeState for each one, pausing two seconds
// after every state. A missing <select> is fatal (log.Fatal).
func scrapeUSA() {
	landing := getNodeFromUrl("http://www.usairnet.com/cgi-bin/launch/code.cgi")
	dropdown, missing := getElement(landing, "select", Attribute{name: "name", value: "state"})
	if missing {
		log.Fatal("scrapeUSA() failed to get 'select' element named 'state'")
	}

	for idx, entry := range getSelectOptions(dropdown) {
		fmt.Printf("state %d: %s - %s\n", idx, entry.code, entry.name)
		scrapeState(entry.code)

		// Throttle requests so the site is not hammered.
		fmt.Printf("\nSleeping for 2secs to give website a break... ")
		time.Sleep(2 * time.Second)
		fmt.Printf("ok let's go again!\n\n")
	}
}
// main loads any previously scraped results from data.json and then runs the
// scrape selected by the number of command-line arguments:
//
//	no arguments      scrape every state (scrapeUSA)
//	<state>           scrape every station of one state (scrapeState)
//	<state> <station> scrape a single station (scrapeStation)
//
// Any other argument count prints the usage line.
func main() {
	args := os.Args

	// Start from the existing data so a partial run extends it. A missing or
	// empty data.json just means there is nothing to load yet. Any other
	// read error, or a file that does not parse, is fatal: continuing would
	// start from empty data and the next successful scrape would overwrite
	// the file, discarding whatever it held.
	file, err := ioutil.ReadFile("data.json")
	switch {
	case err == nil:
		if len(file) > 0 {
			if err := json.Unmarshal(file, &data); err != nil {
				log.Fatalf("data.json could not be parsed, refusing to overwrite it: %v", err)
			}
		}
	case os.IsNotExist(err):
		// First run: nothing to load.
	default:
		log.Fatal(err)
	}

	switch len(args) {
	case 1:
		scrapeUSA()
	case 2:
		scrapeState(args[1])
	case 3:
		scrapeStation(args[1], args[2])
	default:
		fmt.Println("Usage: go run scrape.go [state] [station]")
	}
}