forked from traject/traject
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdemo_config.rb
155 lines (115 loc) · 6.51 KB
/
demo_config.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# A sample traject configuration. Save it as, say, `traject_config.rb`, then
# run `traject -c traject_config.rb marc_file.marc` to index to the
# Solr specified in the config file, according to the rules specified in
# the config file.
# To have access to various built-in logic
# for pulling things out of MARC21, like `marc_languages`
require 'traject/macros/marc21_semantics'
extend Traject::Macros::Marc21Semantics
# To have access to the traject marc format/carrier classifier
require 'traject/macros/marc_format_classifier'
extend Traject::Macros::MarcFormats
# In this case, for simplicity, we provide all our settings, including
# Solr connection details, in this one file. But you could choose
# to separate them into another config file; divide things between
# files however you like. You can call traject with as many
# config files as you like: `traject -c one.rb -c two.rb -c etc.rb`
# The Solr endpoint this configuration indexes to.
settings { provide("solr.url", "http://solr.somewhere.edu:8983/solr/corename") }
# Take only the first 001 value, then use the block to put a "bib_"
# prefix on whatever was extracted.
to_field "id", extract_marc("001", first: true) do |_marc_record, accumulator, _context|
  accumulator.map! { |control_number| "bib_#{control_number}" }
end
# A fixed literal: every record gets exactly this string.
to_field "source", literal("traject_test_last")

# Whole-record fields.
to_field "marc_display", serialized_marc(format: "binary", binary_escape: false, allow_oversized: true)
to_field "text", extract_all_marc_values

# Plain extractions, each driven by a MARC tag/subfield spec string.
to_field "text_extra_boost_t", extract_marc("505art")
to_field "publisher_t", extract_marc("260abef:261abef:262ab:264ab")
to_field "language_facet", marc_languages
to_field "format", marc_formats
to_field "isbn_t", extract_marc("020a:773z:776z:534z:556z")
to_field "lccn", extract_marc("010a")
to_field "material_type_display", extract_marc("300a", separator: nil, trim_punctuation: true)

# Title fields.
to_field "title_t", extract_marc("245ak")
to_field "title1_t", extract_marc("245abk")
to_field "title2_t", extract_marc("245nps:130:240abcdefgklmnopqrs:210ab:222ab:242abcehnp:243abcdefgklmnopqrs:246abcdefgnp:247abcdefgnp")
to_field "title3_t", extract_marc("700gklmnoprst:710fgklmnopqrst:711fgklnpst:730abdefgklmnopqrst:740anp:505t:780abcrst:785abcrst:773abrst")
# The same output field may be named more than once; values from a later
# `to_field` are added on to what is already there.
#
# Custom logic: collect 505$t values, but only from 505 fields that also
# carry a $r -- we consider those more likely to be title-ish strings.
to_field "title3_t" do |record, accumulator|
  record.each_by_tag('505') do |contents_note|
    next unless contents_note['r']

    title_values = contents_note.subfields
                                .select { |subfield| subfield.code == 't' }
                                .map(&:value)
                                .compact
    accumulator.concat(title_values)
  end
end
# Display/sort titles and series.
to_field "title_display", extract_marc("245abk", trim_punctuation: true, first: true)
to_field "title_sort", marc_sortable_title
to_field "title_series_t", extract_marc("440a:490a:800abcdt:400abcd:810abcdt:410abcd:811acdeft:411acdef:830adfgklmnoprst:760ast:762ast")
to_field "series_facet", marc_series_facet

# Author fields.
to_field "author_unstem", extract_marc("100abcdgqu:110abcdgnu:111acdegjnqu")
to_field "author2_unstem", extract_marc("700abcdegqu:710abcdegnu:711acdegjnqu:720a:505r:245c:191abcdegqu")
to_field "author_display", extract_marc("100abcdq:110:111")
to_field "author_sort", marc_sortable_author
to_field "author_facet", extract_marc("100abcdq:110abcdgnu:111acdenqu:700abcdq:710abcdgnu:711acdenqu", trim_punctuation: true)

# Subject text.
to_field "subject_t", extract_marc("600:610:611:630:650:651avxyz:653aa:654abcvyz:655abcvxyz:690abcdxyz:691abxyz:692abxyz:693abxyz:656akvxyz:657avxyz:652axyz:658abcd")
# Topic facet values. The block upcases a leading lowercase ASCII letter,
# because MeSH headings are sometimes inconsistently downcased.
to_field "subject_topic_facet",
         extract_marc(
           "600abcdtq:610abt:610x:611abt:611x:630aa:630x:648a:648x:650aa:650x:651a:651x:691a:691x:653aa:654ab:656aa:690a:690x",
           trim_punctuation: true
         ) do |_record, accumulator|
  # The pattern is anchored with \A, so at most the first character changes.
  accumulator.map! { |topic| topic.sub(/\A[a-z]/, &:upcase) }
end
# Geographic and era subject facets.
to_field "subject_geo_facet", marc_geo_facet
to_field "subject_era_facet", marc_era_facet

# Not doing this at present.
#to_field "subject_facet", extract_marc("600:610:611:630:650:651:655:690")

# Publication place and date.
to_field "published_display", extract_marc("260a", trim_punctuation: true)
to_field "pub_date", marc_publication_date
# An example of more complex Ruby logic written inline in the config file.
# Anything much more complicated than this is probably better extracted
# to an external routine to keep things tidy.
#
# Start from traject's LCC-to-broad-category routine, then use a custom
# block to also consult our local holdings 9xx info, and to classify
# SuDoc-possessing records under the 'Government Publication' discipline.
# If nothing at all was accumulated, the value falls back to "Unknown".
to_field "discipline_facet", marc_lcc_to_broad_category(:default => nil) do |record, accumulator|
  # Add in our local call numbers.
  Traject::MarcExtractor.cached("991:937").each_matching_line(record) do |field, _spec, _extractor|
    # We output call type 'processor' in subfield 'f' of our holdings
    # fields; that sort of maybe tells us if it's an LCC field --
    # when the data is right, which it often isn't.
    call_type = field['f']
    if call_type == "sudoc"
      # We choose to call it:
      accumulator << "Government Publication"
    elsif call_type.nil? || call_type == "lc" || field['a'] =~ Traject::Macros::Marc21Semantics::LCC_REGEX
      # Run the first character of the call number through the map.
      s = field['a']
      s = s.slice(0, 1) if s
      category = Traject::TranslationMap.new("lcc_top_level")[s]
      # Only append a real category: a nil in the accumulator would make
      # `accumulator.empty?` false below and silently suppress the
      # "Unknown" fallback.
      accumulator << category unless category.nil?
    end
  end

  # If it's got an 086, we'll put it in "Government Publication", to be
  # consistent with when we do that from a local SuDoc call number.
  if Traject::MarcExtractor.cached("086a").extract(record).length > 0
    accumulator << "Government Publication"
  end

  # uniq it in case we added the same thing twice with GovPub.
  accumulator.uniq!

  if accumulator.empty?
    accumulator << "Unknown"
  end
end
# Instrumentation.
to_field "instrumentation_facet", marc_instrumentation_humanized
to_field "instrumentation_code_unstem", marc_instrument_codes_normalized

# Standard numbers; the ISSN extractions pass a nil separator.
to_field "issn", extract_marc("022a:022l:022y:773x:774x:776x", separator: nil)
to_field "issn_related", extract_marc("490x:440x:800x:400x:410x:411x:810x:811x:830x:700x:710x:711x:730x:780x:785x:777x:543x:760x:762x:765x:767x:770x:772x:775x:786x:787x", separator: nil)
to_field "oclcnum_t", oclcnum
to_field "other_number_unstem", extract_marc("024a:028a")