Skip to content

Commit

Permalink
Major changes: Added Signal handling
Browse files Browse the repository at this point in the history
Dictionaries: Corrected typo in name of Substitution dictionary
LangUtils: Addressed an edge case when text contains too many "neutral" characters
FloatVector, StringVector: Added a data element for Signal shortkeys, added handling to emotion arithmetic
ContentIndexSummary: added a deep "copy" operation
HashUtils: Added hash function to calculate Signal shortkey
  • Loading branch information
datancoffee committed Nov 5, 2017
1 parent 661ff13 commit da4a600
Show file tree
Hide file tree
Showing 23 changed files with 298 additions and 69 deletions.
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,15 @@ You will also need to add a dependency to the model files
</dependency>
```

## How to Run Standard Test Cases

This repo contains a set of blog posts and other text documents that can be used as inputs for verification of changes. The main test script that invokes the Sirocco Indexer is located in the /src/test/scripts/ folder. It runs the indexer through all files with .txt extensions in the /src/test/resources/in folder and produces outputs in the /src/test/resources/out folder. To run the test script, execute the following command in shell

```
./src/test/scripts/runindexer.sh TOPSENTIMENTS
```

The test script accepts a single parameter - the indexing type. The acceptable values are FULLINDEX and TOPSENTIMENTS. When Top Sentiments is specified, the Indexer will select the top 4 sentence chunks (a few sequential sentences in text that have the same sentiment valence) in input text and output them in the output file. When Full Index is selected, all sentence chunks will be output.

## Additional Publications and Documentation

Expand All @@ -96,7 +105,7 @@ You will also need to add a dependency to the model files

## Want to Help or Get in Touch?

* Get in touch with @datancoffee on [Twitter](https://twitter.com/datancoffee) or [Medium](http://medium.com/datancoffee) if you want to help with the project or need help.
* Get in touch with @datancoffee on [Twitter](https://twitter.com/datancoffee) or [Medium](http://medium.com/@datancoffee) if you want to help with the project or need help.



Expand Down
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>sirocco.sirocco-sa</groupId>
<artifactId>sirocco-sa</artifactId>
<version>1.0.3</version>
<version>1.0.6</version>
<packaging>jar</packaging>
<name>Sirocco Sentiment Analysis</name>
<description>The sentiment analysis SDK</description>
Expand Down Expand Up @@ -70,7 +70,7 @@
<dependency>
<groupId>sirocco.sirocco-mo</groupId>
<artifactId>sirocco-mo</artifactId>
<version>[1.0.0,2.0.0)</version>
<version>[1.0.3,2.0.0)</version>
</dependency>
<dependency>
<groupId>commons-lang</groupId>
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/sirocco/indexer/DerivationStep.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ public class DerivationStep
{
public String Action;
public Span DerivationSpan;
public static String AccumulateAction = "accumulate";
public static final String AccumulateAction = "accumulate";
//public static string MultiplyAction = "multiply";
public static String NegateAndMultiplyAction = "negate-multiply";
public static final String NegateAndMultiplyAction = "negate-multiply";
//public static string NeutralizeAction = "neutralize";
public DerivationStep(String action, Span span) {
Action = action;
Expand Down
20 changes: 14 additions & 6 deletions src/main/java/sirocco/indexer/EnglishIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,8 @@ public void index(ContentIndex contentindex) throws Exception {
selectTopTags(contentindex);
buildLabelledSentences(contentindex);
chunkLabelledSentences(contentindex);
selectSentiments(contentindex);
selectSentiments(contentindex);
contentindex.IsIndexingSuccessful = true;
contentindex.ActionTimestamps.put("Index:stop", Calendar.getInstance().getTime());
}

Expand Down Expand Up @@ -1248,9 +1249,11 @@ else if ((parsedepth == IndexingConsts.ParseDepth.SHALLOW) && (tlShallowAccumula
{
for (Parse parse : plainParses)
{
FloatVector parsevector = new FloatVector(parse.getSpan(), SentimentDimension.GeneralSentiment);
FloatVector parsevector = new FloatVector(parse.getSpan(), SentimentDimension.GeneralSentiment);
listvector.getValue().accumulate(parsevector);
}
FloatVector valueofchunk = mDicts.Modifiers.getWords().get("ADJP_ADVP_chunk");
listvector.getValue().accumulate(valueofchunk);
}

}
Expand Down Expand Up @@ -1483,10 +1486,15 @@ else if (tokens.length == 2)
vector = dict.getWords().get(bestbaseform + '/' + pos);
if (vector == null)
return null;

vector.accumulate(prefixvector);
vector.applyNegationAndMultiplication();
return vector;

// sso 10/19/2017 fix
FloatVector vectorcopy = new FloatVector();
vectorcopy.accumulate(vector);
vectorcopy.accumulate(prefixvector);
vectorcopy.applyNegationAndMultiplication();
return vectorcopy;
// end fix

}
else if (tokens.length == 1)
{
Expand Down
102 changes: 73 additions & 29 deletions src/main/java/sirocco/indexer/FloatVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@
import sirocco.indexer.IndexingConsts.SentimentValence;
import sirocco.indexer.util.LangUtils;
import sirocco.model.LabelledSpan;
import sirocco.util.HashUtils;

import java.util.Arrays;
import java.util.HashMap;


Expand All @@ -46,32 +48,39 @@
*/
public class FloatVector extends HashMap<String,Float> implements IGenericVector
{
public static float DefaultValue = 0.0F;
public static float InitialValue = 10.0F;
public static String DimensionStart = "@@";
public static String ValueStart = "+";
public static String ZeroVector = "@@zero";
public static String ScoreDimension = "score";
public static String NegationDimension = "negation";
public static String SlyIronicSarcasticDimension = "sis";
public static String EntityDimension = "entity";
public static String PosOverrideDimension = "ignposoverride";
public static String OriginalTextDimension = "ignorgtxt";
public static String IsLinkDimension = "islink";
public static String RegexOptionDimension = "regexoption";
public static String IsHashTagDimension = "ishashtag";
public static String IsIgnoreDimension = "isignore";
public static String FlagEnding = "[flag]";
public static String KeyEnding = "[key]";
public static String IgnoreDimensionStart = "ign";
// dimensions with this prefix will be printed in intensitytoken, but will not be deserialized.
public static CSList<String> MultiplicativeDimensions = new CSList<String>(new String[]{ ScoreDimension });
public static final float DefaultValue = 0.0F;
public static final float InitialValue = 10.0F;
public static final String DimensionStart = "@@";
public static final String ValueStart = "+";
public static final String ZeroVector = "@@zero";
public static final String ScoreDimension = "score";
public static final String NegationDimension = "negation";
public static final String SlyIronicSarcasticDimension = "sis";
public static final String EntityDimension = "entity";
public static final String PosOverrideDimension = "ignposoverride";
public static final String OriginalTextDimension = "ignorgtxt";
public static final String SignalsDimension = "ignsignals";
public static final String IsLinkDimension = "islink";
public static final String RegexOptionDimension = "regexoption";
public static final String IsHashTagDimension = "ishashtag";
public static final String IsIgnoreDimension = "isignore";
public static final String FlagEnding = "[flag]";
public static final String KeyEnding = "[key]";
public static final String IgnoreDimensionStart = "ign"; // dimensions with this prefix will be printed in intensitytoken, but will not be deserialized.
public static final CSList<String> MultiplicativeDimensions = new CSList<String>(new String[]{ ScoreDimension }); // currently contains only the score dimension

public HashMap<String,Float> Flags;
private HashMap<String,CSList<DerivationStep>> derivationSteps;
private HashMap<String,CSList<DerivationStep>> derivationSteps; // Hashmap key is a sentiment dimension
private CSList<String> shortkeys; // List of Signal shortkeys (usually, hashes) of idioms, emotion or quality words, interjections etc, that appear in the text

public HashMap<String,CSList<DerivationStep>> getDerivationSteps() {
return derivationSteps;
}


/**
 * Returns the Signal shortkeys held by this vector.
 *
 * @return the internal list itself (not a copy), so changes made by the
 *         caller are visible to this vector
 */
public CSList<String> getShortkeys() {
    return this.shortkeys;
}

private CSList<DerivationStep> dimDerivationSteps(String dimension) {
CSList<DerivationStep> dimlist = getDerivationSteps().get(dimension);
if (dimlist == null)
Expand All @@ -87,6 +96,7 @@ public FloatVector() {
super();
derivationSteps = new HashMap<String,CSList<DerivationStep>>();
Flags = new HashMap<String,Float>();
shortkeys = new CSList<String>();
}

public FloatVector(Span span, String initialDimension) {
Expand All @@ -96,7 +106,7 @@ public FloatVector(Span span, String initialDimension) {
this.accumulate(vector,span,true);
}

public void init(String[] keys, String[] fields) {
public void init(String[] keys, String[] fields, String vectorkey) {
for (int i = 0;i < keys.length;i++)
{
float value = (StringSupport.isNullOrEmpty(fields[i])) ? DefaultValue : InitialValue * Float.parseFloat(fields[i]);
Expand All @@ -108,6 +118,8 @@ public void init(String[] keys, String[] fields) {
else
this.put(keys[i], value);
}
String vectorshortkey = HashUtils.getShortkey(vectorkey);
this.getShortkeys().add(vectorshortkey);
}

public void initFromIntensityToken(String token) {
Expand All @@ -126,8 +138,18 @@ public void initFromIntensityToken(String token) {
float value = (StringSupport.isNullOrEmpty(pair[1])) ? DefaultValue : Float.parseFloat(pair[1]);
this.put(key, value);
}
else if (dimvalue.startsWith(SignalsDimension)) // import all shortkeys
{
String[] pair = StringSupport.Split(dimvalue, vsa, StringSplitOptions.None);
if (!StringSupport.isNullOrEmpty(pair[1])) {
String[] ssa = new String[]{ "," };
String[] shortkeyarr = StringSupport.Split(pair[1], ssa, StringSplitOptions.None);
shortkeys.addAll(new CSList<String>(shortkeyarr));
}
}

}

}

public String[] getDimensions() {
Expand All @@ -146,7 +168,20 @@ public String toCSV() {

return retvalue;
}


/**
 * Joins the given strings into one comma-separated string.
 *
 * Elements are written as-is, in iteration order; no quoting or escaping
 * is applied, so elements containing "," are not protected.
 *
 * @param sa the strings to join
 * @return the elements separated by ",", or null when sa is null or empty
 */
public static String toCSV(CSList<String> sa) {
    if (sa == null || sa.size() == 0)
        return null;

    StringBuilder sb = new StringBuilder();
    // The separator is decided by position, not by whether the text built
    // so far is empty; otherwise an empty leading element loses its comma.
    boolean first = true;
    for (String s : sa)
    {
        if (!first)
            sb.append(',');
        sb.append(s);
        first = false;
    }

    return sb.toString();
}


public void accumulate(FloatVector othervector) {
accumulate(othervector,(Span)null,true);
}
Expand Down Expand Up @@ -212,6 +247,10 @@ public void accumulate(FloatVector othervector, CSList<Span> spans, boolean addD
}

}

// Merge shortkeys
this.getShortkeys().addAll(othervector.getShortkeys());

}

public void removeUnusedCombinationParts() {
Expand Down Expand Up @@ -520,13 +559,13 @@ public String toString() {
}
return sb.toString();
}
catch (RuntimeException __dummyCatchVar1)
catch (RuntimeException re)
{
throw __dummyCatchVar1;
throw re;
}
catch (Exception __dummyCatchVar1)
catch (Exception e)
{
throw new RuntimeException(__dummyCatchVar1);
throw new RuntimeException(e);
}

}
Expand All @@ -536,7 +575,12 @@ public String toIntensityToken() {
for (Entry<String,Float> kvp : this.entrySet())
{
sb.append(dimensionToken(kvp.getKey(),String.valueOf(kvp.getValue())));
}
}
if (getShortkeys().size() > 0)
{
String listofshortkeys=FloatVector.toCSV(getShortkeys());
sb.append(dimensionToken(SignalsDimension,listofshortkeys));
}
if (sb.length() == 0)
sb.append(ZeroVector);

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/sirocco/indexer/IGenericVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

public interface IGenericVector
{
void init(String[] keys, String[] fields) throws Exception ;
void init(String[] keys, String[] fields, String vectorkey) throws Exception ;

String[] getDimensions() throws Exception ;

Expand Down
8 changes: 6 additions & 2 deletions src/main/java/sirocco/indexer/Indexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,14 @@ public static void index(ContentIndex contentindex) throws Exception {

contentindex.initializeParagraphs(paragraphs);
contentindex.Language = LangUtils.determineDominantLanguage(paragraphs);
if (StringSupport.equals(contentindex.Language, Language.English))
if (contentindex.Language.equals(Language.English))
EnglishIndexerPool.getInstance().index(contentindex);
else
else if (contentindex.Language.equals(Language.Undetermined))
NonEnglishIndexerPool.getInstance().index(contentindex);
else {
contentindex.IndexingErrors = "Unindexable Text. Too many invalid characters";
contentindex.IsIndexingSuccessful = false;
}
}

/**
Expand Down
1 change: 1 addition & 0 deletions src/main/java/sirocco/indexer/Language.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ public class Language
{
// Language codes; Indexer.index compares contentindex.Language against these values.
public static String English = "EN";
public static String Undetermined = "UD";
// NOTE(review): the identifier is misspelled ("Unindexabe"). It is public, so check all references before renaming it to UnindexableText.
public static String UnindexabeText = "UT";
}


1 change: 1 addition & 0 deletions src/main/java/sirocco/indexer/NonEnglishIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ public void index(ContentIndex contentindex) throws Exception {
ltext.LabelledPositions = new LabelledPositionsV2();
contentindex.SelectedSentiments = new CSList<LabelledText>();
contentindex.SelectedSentiments.add(ltext);
contentindex.IsIndexingSuccessful = true;
}

private HashMap<String,TextTag> calculateFrequencyStats(Document document, String[] words, RefSupport<String> summary) throws Exception {
Expand Down
15 changes: 12 additions & 3 deletions src/main/java/sirocco/indexer/StringVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import sirocco.indexer.DerivationStep;
import sirocco.indexer.IGenericVector;
import sirocco.indexer.util.LangUtils;
import sirocco.util.HashUtils;
import CS2JNet.System.StringSupport;

import java.util.HashMap;
Expand All @@ -39,11 +40,17 @@ public class StringVector extends HashMap<String,String> implements IGenericVect
public static String DefaultValue = "";
public static String FlagEnding = "[flag]";
public HashMap<String,Float> Flags;
private HashMap<String,CSList<DerivationStep>> derivationSteps;
private HashMap<String,CSList<DerivationStep>> derivationSteps; // Hashmap key is a sentiment dimension
// List of shortkeys (usually, hashes) of idioms, emotion or quality words, interjections etc, that appear in the text.
// Initialized at declaration so that init() can add to it without hitting a null list.
private CSList<String> shortkeys = new CSList<String>();

public HashMap<String,CSList<DerivationStep>> getDerivationSteps() throws Exception {
return derivationSteps;
}


/**
 * Returns the shortkeys held by this vector.
 *
 * @return the internal list itself (not a copy)
 */
public CSList<String> getShortkeys() {
    return this.shortkeys;
}

private CSList<DerivationStep> dimDerivationSteps(String dimension) throws Exception {
CSList<DerivationStep> dimlist = getDerivationSteps().get(dimension);
if (dimlist == null)
Expand All @@ -61,7 +68,7 @@ public StringVector() {
Flags = new HashMap<String,Float>();
}

public void init(String[] keys, String[] fields) throws Exception {
public void init(String[] keys, String[] fields, String vectorkey) throws Exception {
for (int i = 0;i < keys.length;i++)
{
String value = (StringSupport.isNullOrEmpty(fields[i])) ? DefaultValue : fields[i];
Expand All @@ -73,6 +80,8 @@ public void init(String[] keys, String[] fields) throws Exception {
else
this.put(keys[i], value);
}
String vectorshortkey = HashUtils.getShortkey(vectorkey);
shortkeys.add(vectorshortkey);
}

public String[] getDimensions() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ else if (line.charAt(i) == ',')
}
TVector v = (TVector) vectorFactory.createNewVector();
String[] fieldnamesarray = fieldnames.toArray(new String[fieldnames.size()]);
v.init(fieldnamesarray, vectorfields);
v.init(fieldnamesarray, vectorfields,key);
getWords().put(key, v);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public class EnglishDictionaries
public EmotionDictionary Emotions;
public InterjectionDictionary Interjections;
public QualityDictionary Qualities;
public SubstituitionDictionary Substituitions;
public SubstitutionDictionary Substitutions;
public DegreeAdverbDictionary DegreeAdverbs;
public ModifierDictionary Modifiers;
public NegatorDictionary Negators;
Expand All @@ -60,7 +60,7 @@ public EnglishDictionaries() throws Exception {
Emotions = new EmotionDictionary(getClass().getResourceAsStream("/csdict/emotions-en.csv"));
Interjections = new InterjectionDictionary(getClass().getResourceAsStream("/csdict/interjections-en.csv"));
Qualities = new QualityDictionary(getClass().getResourceAsStream("/csdict/qualities-en.csv"));
Substituitions = new SubstituitionDictionary(getClass().getResourceAsStream("/csdict/substituitions-en.csv"));
Substitutions = new SubstitutionDictionary(getClass().getResourceAsStream("/csdict/substitutions-en.csv"));
DegreeAdverbs = new DegreeAdverbDictionary(getClass().getResourceAsStream("/csdict/degree-adv-en.csv"));
Modifiers = new ModifierDictionary(getClass().getResourceAsStream("/csdict/modifiers-en.csv"));
Negators = new NegatorDictionary(getClass().getResourceAsStream("/csdict/negators-en.csv"));
Expand Down
Loading

0 comments on commit da4a600

Please sign in to comment.