Skip to content

Commit

Permalink
Adds some punctuation massaging.
Browse files Browse the repository at this point in the history
  • Loading branch information
hzafar committed Jan 5, 2021
1 parent 43b8a14 commit b3b0963
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 7 deletions.
21 changes: 21 additions & 0 deletions src/main/kotlin/ca/voidstarzero/isbd/ParserUtils.kt
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,27 @@ fun droppedPeriods(input: String): List<String> {
}
}

/**
 * Detects improper punctuation in [input] and corrects it so that the field
 * gives a better parse. This may ultimately not be the right way to do this.
 *
 * One correction is applied at present: when a subfield's data ends with a
 * colon that has no space in front of it (e.g. "State summary:"), the space is
 * inserted ("State summary :"). Every other subfield is passed through as is.
 *
 * @param input the field whose subfield data is inspected
 * @return a new [MARCField] carrying the tag and indicators of [input] together
 * with the corrected subfields
 */
fun punctuationFixes(input: MARCField): MARCField {
    // if colon at subfield boundary is missing preceding space, add it in
    val withColonSpaces = input.subfields.map { subfield ->
        val data = subfield.second
        // Plain string checks rather than a regex: nothing has to be compiled
        // for each subfield, and data containing line breaks is handled too
        // (a regex '.' does not match line terminators). At least one
        // character must precede the colon, so a lone ":" is left untouched.
        val colonLacksSpace = data.length >= 2 &&
            data.endsWith(':') &&
            data[data.length - 2] != ' '

        if (colonLacksSpace) {
            Pair(subfield.first, data.substring(0, data.length - 1) + " :")
        } else {
            subfield
        }
    }

    return MARCField(
        input.tag,
        input.indicators,
        withColonSpaces
    )
}

fun usesISBD(input: MARCField): Boolean {
if (input.subfields.any { it.first == 'c' }
&& !input.fieldData().contains(" / ")) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
package ca.voidstarzero.isbd.titlestatement

import ca.voidstarzero.isbd.cleanInput
import ca.voidstarzero.isbd.prepare
import ca.voidstarzero.isbd.simpleParse
import ca.voidstarzero.isbd.*
import ca.voidstarzero.isbd.titlestatement.ast.TitleStatement
import ca.voidstarzero.isbd.titlestatement.grammar.TitleStatementGrammar
import ca.voidstarzero.isbd.titlestatement.grammar.monographRoot
import ca.voidstarzero.isbd.titlestatement.grammar.seriesRootWithMARC
import ca.voidstarzero.isbd.usesISBD
import ca.voidstarzero.marc.MARCField
import ca.voidstarzero.marc.fieldData
import norswap.autumn.Autumn
Expand Down Expand Up @@ -59,7 +56,8 @@ class TitleStatementParser : TitleStatementGrammar() {
}

val parseRoot: rule = seriesRootWithMARC(marc)
val result = Autumn.parse(parseRoot, cleanInput(marc.fieldData()), ParseOptions.get())
val preprocessedMarc = punctuationFixes(marc)
val result = Autumn.parse(parseRoot, cleanInput(preprocessedMarc.fieldData()), ParseOptions.get())
if (result.full_match) {
return result.value_stack.mapNotNull { it as TitleStatement }
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,10 @@ class SeriesTitleTest {
)
),
entrySors = listOf(
SOR("Bulgarian Academy of Sciences, Centre for Scientific " +
"Information and Documentation")
SOR(
"Bulgarian Academy of Sciences, Centre for Scientific " +
"Information and Documentation"
)
)
)
)
Expand Down Expand Up @@ -1041,4 +1043,34 @@ class SeriesTitleTest {
assertNotNull(result)
assertEquals(expected, result)
}

// Subfield a ends with a colon that has no space in front of it; parseSerial
// is still expected to return a series title with subfield p as other info.
@Test
fun t28() {
    val field = MARCField(
        "245",
        "|aState summary:|pType and amount of aids paid to all local governmental units and counties.",
        '|'
    )

    // Build the expected tree from the inside out.
    val otherInfo = SeriesOtherInfo("Type and amount of aids paid to all local governmental units and counties")
    val seriesTitle = SeriesTitle(
        title = "State summary",
        otherInfo = listOf(otherInfo)
    )
    val series = Series(seriesTitle = seriesTitle)
    val expected = listOf(TitleStatement(titles = listOf(series)))

    val actual = t.parseSerial(field)

    assertNotNull(actual)
    assertEquals(expected, actual)
}

}

0 comments on commit b3b0963

Please sign in to comment.