Skip to content

Commit

Permalink
OPENNLP-1539 - Introduce parameter for POSTaggerME to configure outpu…
Browse files Browse the repository at this point in the history
…t POS tag format
  • Loading branch information
rzo1 committed May 23, 2024
1 parent 64844b0 commit 8ef1a5a
Show file tree
Hide file tree
Showing 7 changed files with 321 additions and 81 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,38 +14,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.postag;

import java.io.IOException;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;

public class POSTaggerMEIT {

private static POSTagger tagger;

/**
 * Creates the {@link POSTaggerME} instance shared by all tests of this class.
 * It is built once (JUnit {@code @BeforeAll}) via the language-code constructor
 * for {@code "en"} and stored in the static {@code tagger} field.
 *
 * @throws IOException declared for the tagger construction; presumably raised when the
 *                     English model cannot be obtained or read - TODO confirm against
 *                     {@code POSTaggerME(String)}.
 */
@BeforeAll
public static void prepare() throws IOException {
tagger = new POSTaggerME("en");
}

/**
 * Tags a short English sentence with the shared tagger and checks that the
 * produced tag sequence equals the expected one, position by position.
 */
@Test
void testPOSTagger() {
  final String[] sentence = {"The", "driver", "got", "badly", "injured", "."};

  // TODO OPENNLP-1539 Adjust this depending on the POSFormat
  final String[] expected = {"DET", "NOUN", "VERB", "ADV", "VERB", "PUNCT"};

  Assertions.assertArrayEquals(expected, tagger.tag(sentence));
}
/**
 * Enumerates the tag sets a part-of-speech tagger can emit: the
 * <a href="https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html">Penn Treebank</a>
 * tag set, the
 * <a href="https://universaldependencies.org/u/pos/index.html">Universal Dependencies (UD)</a>
 * tag set, or an unidentified one.
 */
public enum POSTagFormat {

  /** Universal Dependencies part-of-speech tags. */
  UD,

  /** Penn Treebank part-of-speech tags. */
  PENN,

  /** A tag set that is identified as neither {@link #UD} nor {@link #PENN}. */
  UNKNOWN
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.postag;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Maps part-of-speech tags between the Penn Treebank (PENN) and the
 * Universal Dependencies (UD) tag sets.
 * <p>
 * The tag set of the underlying model is guessed once, at construction time, from the
 * model's possible outcomes. {@link #convertTag(String)} then always converts <em>away</em>
 * from that guessed format: UD tags become PENN tags and PENN tags become UD tags.
 * If the format could not be determined, tags are passed through unchanged.
 * <p>
 * The PENN to UD direction follows the
 * <a href="https://universaldependencies.org/tagset-conversion/en-penn-uposf.html">Universal
 * Dependencies conversion table</a>. The UD to PENN direction is lossy: several PENN tags
 * collapse into one UD tag, so a single representative PENN tag is returned and a warning
 * is logged for the ambiguous cases.
 */
public class POSTagFormatMapper {

  private static final Logger logger = LoggerFactory.getLogger(POSTagFormatMapper.class);

  private static final Map<String, String> CONVERSION_TABLE_PENN_TO_UD = new HashMap<>();
  private static final Map<String, String> CONVERSION_TABLE_UD_TO_PENN = new HashMap<>();

  /*
   * UD tags whose back conversion to PENN is ambiguous, mapped to a short explanation
   * that is used in the warning emitted by convertTag(String).
   */
  private static final Map<String, String> AMBIGUOUS_UD_TAGS = new HashMap<>();

  static {
    /*
     * This is a conversion table to convert PENN to UD format as described in
     * https://universaldependencies.org/tagset-conversion/en-penn-uposf.html
     */
    CONVERSION_TABLE_PENN_TO_UD.put("#", "SYM");
    CONVERSION_TABLE_PENN_TO_UD.put("$", "SYM");
    CONVERSION_TABLE_PENN_TO_UD.put("''", "PUNCT");
    CONVERSION_TABLE_PENN_TO_UD.put(",", "PUNCT");
    CONVERSION_TABLE_PENN_TO_UD.put("-LRB-", "PUNCT");
    CONVERSION_TABLE_PENN_TO_UD.put("-RRB-", "PUNCT");
    CONVERSION_TABLE_PENN_TO_UD.put(".", "PUNCT");
    CONVERSION_TABLE_PENN_TO_UD.put(":", "PUNCT");
    CONVERSION_TABLE_PENN_TO_UD.put("AFX", "ADJ");
    CONVERSION_TABLE_PENN_TO_UD.put("CC", "CCONJ");
    CONVERSION_TABLE_PENN_TO_UD.put("CD", "NUM");
    CONVERSION_TABLE_PENN_TO_UD.put("DT", "DET");
    CONVERSION_TABLE_PENN_TO_UD.put("EX", "PRON");
    CONVERSION_TABLE_PENN_TO_UD.put("FW", "X");
    CONVERSION_TABLE_PENN_TO_UD.put("HYPH", "PUNCT");
    CONVERSION_TABLE_PENN_TO_UD.put("IN", "ADP");
    CONVERSION_TABLE_PENN_TO_UD.put("JJ", "ADJ");
    CONVERSION_TABLE_PENN_TO_UD.put("JJR", "ADJ");
    CONVERSION_TABLE_PENN_TO_UD.put("JJS", "ADJ");
    CONVERSION_TABLE_PENN_TO_UD.put("LS", "X");
    CONVERSION_TABLE_PENN_TO_UD.put("MD", "VERB");
    CONVERSION_TABLE_PENN_TO_UD.put("NIL", "X");
    CONVERSION_TABLE_PENN_TO_UD.put("NN", "NOUN");
    CONVERSION_TABLE_PENN_TO_UD.put("NNP", "PROPN");
    CONVERSION_TABLE_PENN_TO_UD.put("NNPS", "PROPN");
    CONVERSION_TABLE_PENN_TO_UD.put("NNS", "NOUN");
    CONVERSION_TABLE_PENN_TO_UD.put("PDT", "DET");
    CONVERSION_TABLE_PENN_TO_UD.put("POS", "PART");
    CONVERSION_TABLE_PENN_TO_UD.put("PRP", "PRON");
    CONVERSION_TABLE_PENN_TO_UD.put("PRP$", "DET");
    CONVERSION_TABLE_PENN_TO_UD.put("RB", "ADV");
    CONVERSION_TABLE_PENN_TO_UD.put("RBR", "ADV");
    CONVERSION_TABLE_PENN_TO_UD.put("RBS", "ADV");
    CONVERSION_TABLE_PENN_TO_UD.put("RP", "ADP");
    CONVERSION_TABLE_PENN_TO_UD.put("SYM", "SYM");
    CONVERSION_TABLE_PENN_TO_UD.put("TO", "PART");
    CONVERSION_TABLE_PENN_TO_UD.put("UH", "INTJ");
    CONVERSION_TABLE_PENN_TO_UD.put("VB", "VERB");
    CONVERSION_TABLE_PENN_TO_UD.put("VBD", "VERB");
    CONVERSION_TABLE_PENN_TO_UD.put("VBG", "VERB");
    CONVERSION_TABLE_PENN_TO_UD.put("VBN", "VERB");
    CONVERSION_TABLE_PENN_TO_UD.put("VBP", "VERB");
    CONVERSION_TABLE_PENN_TO_UD.put("VBZ", "VERB");
    CONVERSION_TABLE_PENN_TO_UD.put("WDT", "DET");
    CONVERSION_TABLE_PENN_TO_UD.put("WP", "PRON");
    CONVERSION_TABLE_PENN_TO_UD.put("WP$", "DET");
    CONVERSION_TABLE_PENN_TO_UD.put("WRB", "ADV");

    /*
     * Note: The back conversion might lose information.
     */
    CONVERSION_TABLE_UD_TO_PENN.put("ADJ", "JJ");
    CONVERSION_TABLE_UD_TO_PENN.put("ADP", "IN");
    CONVERSION_TABLE_UD_TO_PENN.put("ADV", "RB");
    CONVERSION_TABLE_UD_TO_PENN.put("AUX", "MD");
    CONVERSION_TABLE_UD_TO_PENN.put("CCONJ", "CC");
    CONVERSION_TABLE_UD_TO_PENN.put("DET", "DT");
    CONVERSION_TABLE_UD_TO_PENN.put("INTJ", "UH");
    CONVERSION_TABLE_UD_TO_PENN.put("NOUN", "NN");
    CONVERSION_TABLE_UD_TO_PENN.put("NUM", "CD");
    CONVERSION_TABLE_UD_TO_PENN.put("PART", "RP");
    CONVERSION_TABLE_UD_TO_PENN.put("PRON", "PRP");
    CONVERSION_TABLE_UD_TO_PENN.put("PROPN", "NNP");
    CONVERSION_TABLE_UD_TO_PENN.put("PUNCT", ".");
    CONVERSION_TABLE_UD_TO_PENN.put("SCONJ", "IN");
    CONVERSION_TABLE_UD_TO_PENN.put("SYM", "SYM");
    CONVERSION_TABLE_UD_TO_PENN.put("VERB", "VB");
    CONVERSION_TABLE_UD_TO_PENN.put("X", "FW");

    /*
     * Explanations for the UD tags that cannot be mapped back to exactly one PENN tag.
     */
    AMBIGUOUS_UD_TAGS.put("NOUN", "NOUN can be 'NN' or 'NNS' depending on the number");
    AMBIGUOUS_UD_TAGS.put("PART", "PART can be 'POS', 'RP' or 'TO'");
    AMBIGUOUS_UD_TAGS.put("PROPN", "PROPN can be 'NNP' or 'NNPS'");
    AMBIGUOUS_UD_TAGS.put("PUNCT", "PUNCT needs specific punctuation mapping");
    AMBIGUOUS_UD_TAGS.put("VERB", "VERB can be 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'");
  }

  private final POSTagFormat modelFormat;

  /**
   * Creates a mapper and guesses the tag format of the model from its outcomes.
   *
   * @param possibleOutcomes the tags the model can produce. Must not be {@code null}.
   * @throws NullPointerException if {@code possibleOutcomes} is {@code null}.
   */
  protected POSTagFormatMapper(final String[] possibleOutcomes) {
    Objects.requireNonNull(possibleOutcomes, "Outcomes must not be NULL.");
    this.modelFormat = guessModelTagFormat(possibleOutcomes);
  }

  /**
   * Converts every tag of the given list via {@link #convertTag(String)},
   * preserving the order of the input.
   *
   * @param tags a list of tags to be converted. Must not be {@code null}.
   * @return an array holding the converted tags, one per input tag.
   * @throws NullPointerException if {@code tags} is {@code null}.
   */
  public String[] convertTags(List<String> tags) {
    Objects.requireNonNull(tags, "Supplied tags must not be NULL.");
    return tags.stream()
        .map(this::convertTag)
        .toArray(String[]::new);
  }

  /**
   * Converts a single tag away from the guessed model format.
   * <ul>
   *   <li>Model format {@link POSTagFormat#UD}: the tag is treated as a UD tag and mapped
   *   to a PENN tag. For UD tags that correspond to several PENN tags a warning is logged
   *   on each call and one representative PENN tag is returned.</li>
   *   <li>Model format {@link POSTagFormat#PENN}: the tag is treated as a PENN tag and
   *   mapped to its UD tag.</li>
   *   <li>Otherwise the tag is returned unchanged.</li>
   * </ul>
   *
   * @param tag no restrictions on this parameter.
   * @return the converted tag, {@code "?"} if the tag has no entry in the applicable
   *         conversion table, or {@code tag} itself if the model format is unknown.
   */
  public String convertTag(String tag) {
    switch (modelFormat) {
      case UD -> {
        final String pennTag = CONVERSION_TABLE_UD_TO_PENN.getOrDefault(tag, "?");
        // Only the UD -> PENN direction is lossy, so the ambiguity warning belongs here.
        final String ambiguity = AMBIGUOUS_UD_TAGS.get(tag);
        if (ambiguity != null) {
          // Report the value actually looked up so message and table cannot drift apart.
          logger.warn("Ambiguity detected: {}. Returning '{}'.", ambiguity, pennTag);
        }
        return pennTag;
      }
      case PENN -> {
        return CONVERSION_TABLE_PENN_TO_UD.getOrDefault(tag, "?");
      }
      default -> {
        return tag;
      }
    }
  }

  /**
   * @return The guessed {@link POSTagFormat}. Guaranteed to be not {@code null}.
   */
  public POSTagFormat getGuessedFormat() {
    return this.modelFormat;
  }

  /**
   * Guesses the {@link POSTagFormat} by using majority quorum: every outcome that is a
   * known UD tag counts for UD, every outcome that is a known PENN tag counts for PENN
   * (a tag present in both tables, such as {@code SYM}, counts for both).
   *
   * @param outcomes must not be {@code null}.
   * @return the format with strictly more matches, or {@link POSTagFormat#UNKNOWN}
   *         if both formats have the same number of matches.
   */
  private static POSTagFormat guessModelTagFormat(final String[] outcomes) {
    int udMatches = 0;
    int pennMatches = 0;

    for (String outcome : outcomes) {
      if (CONVERSION_TABLE_UD_TO_PENN.containsKey(outcome)) {
        udMatches++;
      }
      if (CONVERSION_TABLE_PENN_TO_UD.containsKey(outcome)) {
        pennMatches++;
      }
    }

    if (udMatches > pennMatches) {
      return POSTagFormat.UD;
    } else if (pennMatches > udMatches) {
      return POSTagFormat.PENN;
    } else {
      logger.warn("Detected an unknown POS format.");
      return POSTagFormat.UNKNOWN;
    }
  }
}
Loading

0 comments on commit 8ef1a5a

Please sign in to comment.