Skip to content

Commit

Permalink
move quotations from text to whitespace
Browse files Browse the repository at this point in the history
  • Loading branch information
spencermountain committed Jun 19, 2018
1 parent 74b6371 commit f1457f6
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 20 deletions.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
},
"scripts": {
"test": "tape \"./test/unit/**/*.test.js\" | tap-dancer",
"test:spec": "tape \"./test/unit/**/*.test.js\" | tap-spec",
"testb": "TESTENV=prod tape \"./test/unit/**/*.test.js\" | tap-dancer",
"buildTest": "TESTENV=prod node ./scripts/test.js",
"test:types": "tsc --project test/types",
Expand Down
4 changes: 2 additions & 2 deletions scratch.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ var nlp = require('./src/index');
// console.log(doc.list[0].terms[0]);
// console.log(doc.values().toNumber().out('text'));

var doc = nlp(`he lived (but barely) and died (in a pile)`);
var doc = nlp(`twas good cookin'`);
doc.debug();
doc.quotations().debug();
// doc.quotations().debug();


// console.log(nlp('Director of the F.B.I').acronyms().addPeriods().out('text'));
Expand Down
35 changes: 22 additions & 13 deletions src/tagger/steps/15-quotation_step.js
Original file line number Diff line number Diff line change
Expand Up @@ -86,48 +86,49 @@ const quotemarks = {
};

// Open quote match black list.
const blacklist = [
'twas'
];
// const blacklist = [
// 'twas'
// ];

// Convert the close quote to a regex.
Object.keys(quotemarks).forEach((open) => {
quotemarks[open].regex = new RegExp(quotemarks[open].close + '[;:,.]*$');
quotemarks[open].regex = new RegExp(quotemarks[open].close + '[;:,.]*');
quotemarks[open].open = open;
});

// Improve open match detection.
const startQuote = new RegExp(
'^[' + Object.keys(quotemarks).join('') + ']+' +
'(?!' + blacklist.join('|') + ')'
'[' + Object.keys(quotemarks).join('') + ']'
// '(?!' + blacklist.join('|') + ')'
);

//tag an inline quotation as such
const quotation_step = ts => {
// Isolate the text so it doesn't change.
const terms = ts.terms.slice(0).map(e => e.text);
const terms = ts.terms.slice(0); //.map(e => e.text);
for (let i = 0; i < terms.length; i++) {

let t = ts.terms[i];
if (startQuote.test(t.text)) {
if (startQuote.test(t.whitespace.before)) {
// Get the match and split it into groups
let quotes = t.text.match(startQuote).shift().split('');
let quotes = t.whitespace.before.match(startQuote).shift().split('');
// Get close and tag info.
quotes = quotes.map(mark => quotemarks[mark]);
// Look for the ending
for (let o = 0; o < ts.terms.length; o++) {
// max-length don't go-on forever
if (!ts.terms[i + o] || o > 28) {
if (!terms[i + o] || o > 28) {
break;
}
// Find the close.
const index = quotes.findIndex(q => q.regex.test(terms[i + o]));
const index = quotes.findIndex(q => q.regex.test(terms[i + o].whitespace.after));
if (index !== -1) {
// Remove the found
const quote = quotes.splice(index, 1).pop();
terms[i + o] = terms[i + o].replace(quote.regex, '');
// terms[i + o].whitespace.after = terms[i + o].whitespace.after.replace(quote.regex, '');

if (quote.regex.test(ts.terms[i + o].normal)) {
ts.terms[i + o].normal.replace(quote.regex, '');
ts.terms[i + o].whitespace.after.replace(quote.regex, '');
}
// Tag the things.
t.tag('StartQuotation', 'quotation_open');
Expand All @@ -143,6 +144,14 @@ const quotation_step = ts => {
} // for subset
} // open quote
} // for all terms

//fix any issues post-process
if (ts.has('#StartQuotation') === true && ts.has('#EndQuotation') === false) {
// ts.unTag('Quotation');
}
if (ts.has('#EndQuotation') === true && ts.has('#StartQuotation') === false) {
// ts.unTag('Quotation');
}
return ts;
};
module.exports = quotation_step;
27 changes: 24 additions & 3 deletions src/term/whitespace.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,29 @@
'use strict';
//punctuation regs-
const before = /^(\s|-+|\.\.+|\/)+/;
// const quotes = [ //
// ['"', '"'],
// ['\u0022', '\u0022'],
// ['\uFF02', '\uFF02'],
// ['\u0027', '\u0027'],
// ['\u201C', '\u201D'],
// ['\u2018', '\u2019'],
// ['\u201F', '\u201D'],
// ['\u201B', '\u2019'],
// ['\u201E', '\u201D'],
// ['\u2E42', '\u201D'],
// ['\u201A', '\u2019'],
// ['\u00AB', '\u00BB'],
// ['\u2039', '\u203A'],
// ['\u2035', '\u2032'],
// ['\u2036', '\u2033'],
// ['\u2037', '\u2034'],
// ['\u301D', '\u301E'],
// ['\u0060', '\u00B4'],
// ['\u301F', '\u301E'],
// ];
//punctuation regs- are we having fun yet?
const before = /^(\s|-+|\.\.+|\/|"|\u0022|\uFF02|\u0027|\u201C|\u2018|\u201F|\u201B|\u201E|\u2E42|\u201A|\u00AB|\u2039|\u2035|\u2036|\u2037|\u301D|\u0060|\u301F)+/u;
const after = /(\s+|-+|\.\.+|"|\u0022|\uFF02|\u0027|\u201D|\u2019|\u201D|\u2019|\u201D|\u201D|\u2019|\u00BB|\u203A|\u2032|\u2033|\u2034|\u301E|\u00B4)+$/u;
const minusNumber = /^( *)-(\$||¥|£)?([0-9])/;
const after = /(\s+|-+|\.\.+)+$/;

//separate the 'meat' from the trailing/leading whitespace.
//works in concert with ./src/text/tokenize.js
Expand Down
2 changes: 1 addition & 1 deletion test/unit/subset/quotations/quotation-tag.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ test('Quotations - U+0027 to U+0027', function (t) {
].forEach(a => testAllQuotes(a, t));
t.end();
});

//
test('Quotations - U+201C to U+201D', function (t) {
[
['he is \u201Creally good\u201D', 'really good'],
Expand Down
7 changes: 6 additions & 1 deletion test/unit/subset/quotations/quotations.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@ test('quotation test', function(t) {
['so \'as if\' i said', 'as if'],
['the \'truthiness\' i said', 'truthiness'],
['yeah, “fun” and stuff', 'fun'],
['“Fun” and stuff', 'fun']
['“Fun” and stuff', 'fun'],
//dangling start/end
['\'twas good cookin', ''],
['twas good cookin\'', ''],
['twas \'good cookin\'', 'good cookin'],
['\'twas \'good cookin\'', 'twas good cookin']
];
arr.forEach(function(a) {
var r = nlp(a[0]);
Expand Down

0 comments on commit f1457f6

Please sign in to comment.