-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathassociations.rb
executable file
·105 lines (92 loc) · 3.65 KB
/
associations.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env ruby
require 'rubygems'
require 'nokogiri'
require 'yaml'
require 'net/https'
require 'uri'
require 'open-uri'
require 'csv'
# Sends a single HEAD request for +uri+ and returns the Location header of the
# response. Redirects are not followed; only the first response is inspected.
#
# @param uri [String] absolute http(s) URL to probe
# @return [String, nil] value of the Location header, or nil when it is absent
def get_redirect(uri)
  url = URI.parse(uri)
  # NOTE(review): certificate verification is disabled here, exactly as in the
  # original code. That leaves the request open to interception; confirm this
  # is still wanted before relying on the returned location.
  res = Net::HTTP.start(url.host, url.port,
                        use_ssl: url.scheme == 'https',
                        verify_mode: OpenSSL::SSL::VERIFY_NONE) do |conn|
    # request_uri keeps the query string and yields "/" for an empty path,
    # whereas url.path alone would drop the query or raise on an empty path.
    conn.head(url.request_uri)
  end
  res['location']
end
# Reports whether a HEAD request for +uri+ is answered with HTTP status 200.
# The status code and the URI are also written to standard error. Redirects
# are not followed, so a 3xx answer counts as "not 200".
#
# @param uri [String] absolute http(s) URL to probe
# @return [Boolean] true only when the response code is exactly "200"
def is_200?(uri)
  url = URI.parse(uri)
  # Enable TLS for https URIs; without it an https URL cannot be probed.
  res = Net::HTTP.start(url.host, url.port, use_ssl: url.scheme == 'https') do |conn|
    # request_uri keeps the query string and yields "/" for an empty path.
    conn.head(url.request_uri)
  end
  $stderr.puts "#{res.code} for #{uri}"
  res.code == '200'
end
# Scrape the Loeb index page and build +associations+, a hash keyed by the
# text of each volume's hup.harvard.edu link. Every value is a hash with the
# string keys 'author', 'original_title', 'original_year', 'in_loebolus' and,
# when present, 'new_title', 'archive' and 'google'.
#
# URI.open is used because open-uri no longer hooks Kernel#open for URLs on
# Ruby 3.0 and later.
doc = Nokogiri::HTML(URI.open('http://www.edonnelly.com/loebs.html'))
associations = {}
doc.xpath('//a[contains(@href,"books.google.com") or contains(@href,"www.archive.org") or (@href = ".")]').each do |link|
  href = link['href']

  # NOTE(review): [2] selects the second-nearest preceding hup.harvard.edu
  # anchor; presumably this follows the page's table layout -- confirm.
  loeb_link = link.xpath('preceding::a[contains(@href,"hup.harvard.edu")][2]').first
  title_cell = loeb_link && loeb_link.xpath('following::td[1]').first
  unless title_cell
    # Without a volume link and its title cell there is nothing to record.
    $stderr.puts "SKIPPING (no volume/title found): #{href}"
    next
  end

  original_node = loeb_link.xpath('following::td[1]/following::i[1]').first
  year_cell = loeb_link.xpath('following::td[1]/following::i[1]/parent::td[1]').first

  # Either node may be missing; leave the value nil so the fallback below
  # (use the current title as the original title) can actually run.
  original_title = original_node && original_node.content
  original_year = year_cell && year_cell.content[/Original (\d+) Title/, 1]
  # $stderr.puts [loeb_link, title_cell.content, original_title, href].join(',')

  # The title cell reads "author -- title"; with no separator both halves
  # are the same string.
  title_parts = title_cell.content.split(' -- ')
  author = title_parts.first
  title = title_parts.last
  # only modern authors have a comma in the name
  author = '' if author && author.include?(',')

  loeb = loeb_link.content
  unless associations.key?(loeb)
    entry = {
      'author' => author,
      'original_title' => original_title,
      'original_year' => original_year
    }
    if title != original_title
      if original_title.nil? || original_title.empty?
        entry['original_title'] = title
      else
        entry['new_title'] = title
      end
    end
    entry['in_loebolus'] = is_200?("http://ryanfb.github.io/loebolus-data/#{loeb}.pdf")
    associations[loeb] = entry
  end

  # The selector above only admits archive.org links, Google Books links and
  # the placeholder href ".", so a literal substring test is sufficient.
  if href.include?('www.archive.org')
    associations[loeb]['archive'] = href
    # id = href.split('/').last
    # associations[loeb]['openlibrary'] = get_redirect("https://openlibrary.org/ia/#{id}")
  elsif href != '.'
    associations[loeb]['google'] = href
  end
end
# puts associations.to_yaml
# Write one CSV row per entry of +associations+ to loeb-copyright-old.csv and
# report volumes needing attention on standard error:
#   COPYRIGHT WARNING - available in loebolus but first published after 1963
#   NEEDS_YEAR        - available in loebolus but no original year was found
#   NEEDS_PDF         - published before 1923 but not available in loebolus
CSV.open('loeb-copyright-old.csv', "wb") do |sheet|
  sheet << %w{identifier author title year_published pre_1923 1923-1963_copyright_not_renewed in_loebolus notes urls}

  associations.each do |volume, record|
    year_text = record['original_year']
    # nil when no year was scraped, otherwise the year as an Integer
    year = year_text && year_text.to_i
    in_loebolus = record['in_loebolus']

    pre_1923 = year && year < 1923
    # Either true or an empty string, so the column stays blank unless set.
    not_renewed = (year && year.between?(1923, 1963) && in_loebolus) || ''
    notes = record['new_title'] ? "New title: #{record['new_title']}" : ''
    urls = record.values_at('archive', 'google').join(' ').strip

    if in_loebolus
      if !year_text
        $stderr.puts "NEEDS_YEAR: #{volume}"
      elsif year > 1963
        $stderr.puts "COPYRIGHT WARNING: #{volume}"
      end
    elsif pre_1923
      $stderr.puts "NEEDS_PDF: #{volume}"
    end

    sheet << [
      volume,
      record['author'],
      record['original_title'],
      year_text,
      pre_1923,
      not_renewed,
      in_loebolus,
      notes,
      urls
    ]
  end
end