-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscraper.rb
46 lines (39 loc) · 1017 Bytes
/
scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/bin/env ruby
# encoding: utf-8
require 'scraperwiki'
require 'nokogiri'
require 'pry'
require 'open-uri/cached'
# Point the open-uri/cached cache at the relative '.cache' directory.
OpenURI::Cache.cache_path = '.cache'
# Core-class extension: text clean-up helper for strings pulled out of HTML.
class String
  # Returns a new string in which every run of whitespace (anything matched
  # by the POSIX [[:space:]] class) is collapsed to a single space and the
  # leading/trailing whitespace is removed. The receiver is not modified.
  def tidy
    collapsed = gsub(/[[:space:]]+/, ' ')
    collapsed.strip
  end
end
# Downloads +url+ and parses the response body with Nokogiri::HTML.
#
# @param url [String, URI] address of the page to fetch
# @return the document object produced by Nokogiri::HTML
# @raise whatever the fetch raises (e.g. an HTTP or socket error); nothing
#   is rescued here, callers decide how to handle failures
def noko_for(url)
  # Call #open on the URI object instead of Kernel#open: open-uri stopped
  # overriding Kernel#open for URLs in Ruby 3.0, and Kernel#open on an
  # arbitrary string can also open local files or pipes. The block form
  # makes sure the underlying IO is closed once the body has been read.
  html = URI(url).open(&:read)
  Nokogiri::HTML(html)
end
# Scrapes the listing page at +url+ and saves one record per entry.
#
# Every <option> inside the form's <select name="list"> except the first
# one (presumably a placeholder entry -- confirm against the live page) is
# treated as one person: the option text becomes the name and the option
# value is resolved against +url+ to get that person's page, which is
# passed to scrape_person.
#
# @param url [String, URI] address of the listing page
# @return the collection of option nodes that was iterated over
def scrape_list(url)
  noko = noko_for(url)
  noko.css('form select[name="list"] option').drop(1).each do |opt|
    # URI.escape was removed in Ruby 3.0; the default parser's #escape does
    # the same percent-encoding of characters that are unsafe in a URI.
    link = URI.join(url, URI::DEFAULT_PARSER.escape(opt.attr('value')))
    defaults = {
      name: opt.text.tidy,
      party: "None",
      party_id: "na",
      term: 6,
      source: url.to_s,
    }
    # Keys returned by scrape_person win over the defaults, so :source ends
    # up pointing at the person page whenever scrape_person returns one.
    data = defaults.merge(scrape_person(link))
    ScraperWiki.save_sqlite([:name, :term], data)
  end
end
# Scrapes a single person's page for their image.
#
# Fetching is best-effort: if the page cannot be loaded the failure is
# reported on stderr and an empty Hash is returned, so the caller can still
# save the data it already has.
#
# @param url [String, URI] address of the person's page
# @return [Hash] +{ image:, source: }+ on success, where :image is an
#   absolute URL (or '' when no matching <img> was found) and :source is
#   +url+ as a String; +{}+ when the page could not be fetched
def scrape_person(url)
  begin
    noko = noko_for(url)
  rescue StandardError => e
    # Same fallback as before, but no longer silent.
    warn "scrape_person: could not fetch #{url}: #{e.class}: #{e.message}"
    return {}
  end

  # Selects the src attribute of images whose path contains "/cv/".
  image = noko.css('div.innertext img[src*="/cv/"]/@src').text
  # The src may be relative; resolve it against the page address.
  image = URI.join(url, image).to_s unless image.to_s.empty?
  { image: image, source: url.to_s }
end
# Script entry point: run the scrape against the hard-coded listing URL.
scrape_list('http://www.shura.gov.sa/wps/wcm/connect/shuraen/internet/cv')