diff --git a/README.md b/README.md index cecb2e4..9a1f789 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,15 @@ You will also need to add a dependency to the model files ``` +## How to Run Standard Test Cases + +This repo contains a set of blog posts and other text documents that can be used as inputs for verification of changes. The main test script that invokes the Sirocco Indexer is located in the /src/test/scripts/ folder. It runs the indexer through all files with .txt extensions in the /src/test/resources/in folder and produces outputs in the /src/test/resources/out folder. To run the test script, execute the following command in a shell: + +``` +./src/test/scripts/runindexer.sh TOPSENTIMENTS +``` + +The test script accepts a single optional parameter - the indexing type (FULLINDEX is used when it is omitted). The acceptable values are FULLINDEX and TOPSENTIMENTS. When Top Sentiments is specified, the Indexer will select the top 4 sentence chunks (a few sequential sentences in text that have the same sentiment valence) in input text and output them in the output file. When Full Index is selected, all sentence chunks will be output. ## Additional Publications and Documentation @@ -96,7 +105,7 @@ You will also need to add a dependency to the model files ## Want to Help or Get in Touch? -* Get in touch with @datancoffee on [Twitter](https://twitter.com/datancoffee) or [Medium](http://medium.com/datancoffee) if you want to help with the project or need help. +* Get in touch with @datancoffee on [Twitter](https://twitter.com/datancoffee) or [Medium](http://medium.com/@datancoffee) if you want to help with the project or need help. 
diff --git a/pom.xml b/pom.xml index 12ff163..04005fd 100644 --- a/pom.xml +++ b/pom.xml @@ -2,7 +2,7 @@ 4.0.0 sirocco.sirocco-sa sirocco-sa - 1.0.3 + 1.0.6 jar Sirocco Sentiment Analysis The sentiment analysis SDK @@ -70,7 +70,7 @@ sirocco.sirocco-mo sirocco-mo - [1.0.0,2.0.0) + [1.0.3,2.0.0) commons-lang diff --git a/src/main/java/sirocco/indexer/DerivationStep.java b/src/main/java/sirocco/indexer/DerivationStep.java index 99a38bb..8e7a369 100644 --- a/src/main/java/sirocco/indexer/DerivationStep.java +++ b/src/main/java/sirocco/indexer/DerivationStep.java @@ -35,9 +35,9 @@ public class DerivationStep { public String Action; public Span DerivationSpan; - public static String AccumulateAction = "accumulate"; + public static final String AccumulateAction = "accumulate"; //public static string MultiplyAction = "multiply"; - public static String NegateAndMultiplyAction = "negate-multiply"; + public static final String NegateAndMultiplyAction = "negate-multiply"; //public static string NeutralizeAction = "neutralize"; public DerivationStep(String action, Span span) { Action = action; diff --git a/src/main/java/sirocco/indexer/EnglishIndexer.java b/src/main/java/sirocco/indexer/EnglishIndexer.java index b556cbf..11d2766 100644 --- a/src/main/java/sirocco/indexer/EnglishIndexer.java +++ b/src/main/java/sirocco/indexer/EnglishIndexer.java @@ -188,7 +188,8 @@ public void index(ContentIndex contentindex) throws Exception { selectTopTags(contentindex); buildLabelledSentences(contentindex); chunkLabelledSentences(contentindex); - selectSentiments(contentindex); + selectSentiments(contentindex); + contentindex.IsIndexingSuccessful = true; contentindex.ActionTimestamps.put("Index:stop", Calendar.getInstance().getTime()); } @@ -1248,9 +1249,11 @@ else if ((parsedepth == IndexingConsts.ParseDepth.SHALLOW) && (tlShallowAccumula { for (Parse parse : plainParses) { - FloatVector parsevector = new FloatVector(parse.getSpan(), SentimentDimension.GeneralSentiment); + FloatVector 
parsevector = new FloatVector(parse.getSpan(), SentimentDimension.GeneralSentiment); listvector.getValue().accumulate(parsevector); } + FloatVector valueofchunk = mDicts.Modifiers.getWords().get("ADJP_ADVP_chunk"); + listvector.getValue().accumulate(valueofchunk); } } @@ -1483,10 +1486,15 @@ else if (tokens.length == 2) vector = dict.getWords().get(bestbaseform + '/' + pos); if (vector == null) return null; - - vector.accumulate(prefixvector); - vector.applyNegationAndMultiplication(); - return vector; + + // sso 10/19/2017 fix + FloatVector vectorcopy = new FloatVector(); + vectorcopy.accumulate(vector); + vectorcopy.accumulate(prefixvector); + vectorcopy.applyNegationAndMultiplication(); + return vectorcopy; + // end fix + } else if (tokens.length == 1) { diff --git a/src/main/java/sirocco/indexer/FloatVector.java b/src/main/java/sirocco/indexer/FloatVector.java index 14c5fa4..30de0bc 100644 --- a/src/main/java/sirocco/indexer/FloatVector.java +++ b/src/main/java/sirocco/indexer/FloatVector.java @@ -37,7 +37,9 @@ import sirocco.indexer.IndexingConsts.SentimentValence; import sirocco.indexer.util.LangUtils; import sirocco.model.LabelledSpan; +import sirocco.util.HashUtils; +import java.util.Arrays; import java.util.HashMap; @@ -46,32 +48,39 @@ */ public class FloatVector extends HashMap implements IGenericVector { - public static float DefaultValue = 0.0F; - public static float InitialValue = 10.0F; - public static String DimensionStart = "@@"; - public static String ValueStart = "+"; - public static String ZeroVector = "@@zero"; - public static String ScoreDimension = "score"; - public static String NegationDimension = "negation"; - public static String SlyIronicSarcasticDimension = "sis"; - public static String EntityDimension = "entity"; - public static String PosOverrideDimension = "ignposoverride"; - public static String OriginalTextDimension = "ignorgtxt"; - public static String IsLinkDimension = "islink"; - public static String RegexOptionDimension = 
"regexoption"; - public static String IsHashTagDimension = "ishashtag"; - public static String IsIgnoreDimension = "isignore"; - public static String FlagEnding = "[flag]"; - public static String KeyEnding = "[key]"; - public static String IgnoreDimensionStart = "ign"; - // dimensions with this prefix will be printed in intensitytoken, but will not be deserialized. - public static CSList MultiplicativeDimensions = new CSList(new String[]{ ScoreDimension }); + public static final float DefaultValue = 0.0F; + public static final float InitialValue = 10.0F; + public static final String DimensionStart = "@@"; + public static final String ValueStart = "+"; + public static final String ZeroVector = "@@zero"; + public static final String ScoreDimension = "score"; + public static final String NegationDimension = "negation"; + public static final String SlyIronicSarcasticDimension = "sis"; + public static final String EntityDimension = "entity"; + public static final String PosOverrideDimension = "ignposoverride"; + public static final String OriginalTextDimension = "ignorgtxt"; + public static final String SignalsDimension = "ignsignals"; + public static final String IsLinkDimension = "islink"; + public static final String RegexOptionDimension = "regexoption"; + public static final String IsHashTagDimension = "ishashtag"; + public static final String IsIgnoreDimension = "isignore"; + public static final String FlagEnding = "[flag]"; + public static final String KeyEnding = "[key]"; + public static final String IgnoreDimensionStart = "ign"; + public static final CSList MultiplicativeDimensions = new CSList(new String[]{ ScoreDimension }); // dimensions with this prefix will be printed in intensitytoken, but will not be deserialized. 
+ public HashMap Flags; - private HashMap> derivationSteps; + private HashMap> derivationSteps; // Hashmap key is a sentiment dimension + private CSList shortkeys; // List of Signal shortkeys (usually, hashes) of idioms, emotion or quality words, interjections etc, that appear in the text + public HashMap> getDerivationSteps() { return derivationSteps; } - + + public CSList getShortkeys() { + return shortkeys; + } + private CSList dimDerivationSteps(String dimension) { CSList dimlist = getDerivationSteps().get(dimension); if (dimlist == null) @@ -87,6 +96,7 @@ public FloatVector() { super(); derivationSteps = new HashMap>(); Flags = new HashMap(); + shortkeys = new CSList(); } public FloatVector(Span span, String initialDimension) { @@ -96,7 +106,7 @@ public FloatVector(Span span, String initialDimension) { this.accumulate(vector,span,true); } - public void init(String[] keys, String[] fields) { + public void init(String[] keys, String[] fields, String vectorkey) { for (int i = 0;i < keys.length;i++) { float value = (StringSupport.isNullOrEmpty(fields[i])) ? DefaultValue : InitialValue * Float.parseFloat(fields[i]); @@ -108,6 +118,8 @@ public void init(String[] keys, String[] fields) { else this.put(keys[i], value); } + String vectorshortkey = HashUtils.getShortkey(vectorkey); + this.getShortkeys().add(vectorshortkey); } public void initFromIntensityToken(String token) { @@ -126,8 +138,18 @@ public void initFromIntensityToken(String token) { float value = (StringSupport.isNullOrEmpty(pair[1])) ? 
DefaultValue : Float.parseFloat(pair[1]); this.put(key, value); } + else if (dimvalue.startsWith(SignalsDimension)) // import all shortkeys + { + String[] pair = StringSupport.Split(dimvalue, vsa, StringSplitOptions.None); + if (!StringSupport.isNullOrEmpty(pair[1])) { + String[] ssa = new String[]{ "," }; + String[] shortkeyarr = StringSupport.Split(pair[1], ssa, StringSplitOptions.None); + shortkeys.addAll(new CSList(shortkeyarr)); + } + } } + } public String[] getDimensions() { @@ -146,7 +168,20 @@ public String toCSV() { return retvalue; } - + + public static String toCSV(CSList sa) { + String retvalue = null; + if (sa.size() > 0) + { + retvalue = ""; + for (String s : sa) + retvalue += (retvalue.isEmpty() ? s : "," + s); + } + + return retvalue; + } + + public void accumulate(FloatVector othervector) { accumulate(othervector,(Span)null,true); } @@ -212,6 +247,10 @@ public void accumulate(FloatVector othervector, CSList spans, boolean addD } } + + // Merge shortkeys + this.getShortkeys().addAll(othervector.getShortkeys()); + } public void removeUnusedCombinationParts() { @@ -520,13 +559,13 @@ public String toString() { } return sb.toString(); } - catch (RuntimeException __dummyCatchVar1) + catch (RuntimeException re) { - throw __dummyCatchVar1; + throw re; } - catch (Exception __dummyCatchVar1) + catch (Exception e) { - throw new RuntimeException(__dummyCatchVar1); + throw new RuntimeException(e); } } @@ -536,7 +575,12 @@ public String toIntensityToken() { for (Entry kvp : this.entrySet()) { sb.append(dimensionToken(kvp.getKey(),String.valueOf(kvp.getValue()))); - } + } + if (getShortkeys().size() > 0) + { + String listofshortkeys=FloatVector.toCSV(getShortkeys()); + sb.append(dimensionToken(SignalsDimension,listofshortkeys)); + } if (sb.length() == 0) sb.append(ZeroVector); diff --git a/src/main/java/sirocco/indexer/IGenericVector.java b/src/main/java/sirocco/indexer/IGenericVector.java index 5f7b19d..f45b1c1 100644 --- 
a/src/main/java/sirocco/indexer/IGenericVector.java +++ b/src/main/java/sirocco/indexer/IGenericVector.java @@ -29,7 +29,7 @@ public interface IGenericVector { - void init(String[] keys, String[] fields) throws Exception ; + void init(String[] keys, String[] fields, String vectorkey) throws Exception ; String[] getDimensions() throws Exception ; diff --git a/src/main/java/sirocco/indexer/Indexer.java b/src/main/java/sirocco/indexer/Indexer.java index 21539e4..a2c58e7 100644 --- a/src/main/java/sirocco/indexer/Indexer.java +++ b/src/main/java/sirocco/indexer/Indexer.java @@ -46,10 +46,14 @@ public static void index(ContentIndex contentindex) throws Exception { contentindex.initializeParagraphs(paragraphs); contentindex.Language = LangUtils.determineDominantLanguage(paragraphs); - if (StringSupport.equals(contentindex.Language, Language.English)) + if (contentindex.Language.equals(Language.English)) EnglishIndexerPool.getInstance().index(contentindex); - else + else if (contentindex.Language.equals(Language.Undetermined)) NonEnglishIndexerPool.getInstance().index(contentindex); + else { + contentindex.IndexingErrors = "Unindexable Text. 
Too many invalid characters"; + contentindex.IsIndexingSuccessful = false; + } } /** diff --git a/src/main/java/sirocco/indexer/Language.java b/src/main/java/sirocco/indexer/Language.java index f6bfc25..6df4891 100644 --- a/src/main/java/sirocco/indexer/Language.java +++ b/src/main/java/sirocco/indexer/Language.java @@ -34,6 +34,7 @@ public class Language { public static String English = "EN"; public static String Undetermined = "UD"; + public static String UnindexabeText = "UT"; } diff --git a/src/main/java/sirocco/indexer/NonEnglishIndexer.java b/src/main/java/sirocco/indexer/NonEnglishIndexer.java index d923328..f13c9e1 100644 --- a/src/main/java/sirocco/indexer/NonEnglishIndexer.java +++ b/src/main/java/sirocco/indexer/NonEnglishIndexer.java @@ -81,6 +81,7 @@ public void index(ContentIndex contentindex) throws Exception { ltext.LabelledPositions = new LabelledPositionsV2(); contentindex.SelectedSentiments = new CSList(); contentindex.SelectedSentiments.add(ltext); + contentindex.IsIndexingSuccessful = true; } private HashMap calculateFrequencyStats(Document document, String[] words, RefSupport summary) throws Exception { diff --git a/src/main/java/sirocco/indexer/StringVector.java b/src/main/java/sirocco/indexer/StringVector.java index b91dd81..7a2b617 100644 --- a/src/main/java/sirocco/indexer/StringVector.java +++ b/src/main/java/sirocco/indexer/StringVector.java @@ -30,6 +30,7 @@ import sirocco.indexer.DerivationStep; import sirocco.indexer.IGenericVector; import sirocco.indexer.util.LangUtils; +import sirocco.util.HashUtils; import CS2JNet.System.StringSupport; import java.util.HashMap; @@ -39,11 +40,17 @@ public class StringVector extends HashMap implements IGenericVect public static String DefaultValue = ""; public static String FlagEnding = "[flag]"; public HashMap Flags; - private HashMap> derivationSteps; + private HashMap> derivationSteps; // Hashmap key is a sentiment dimension + private CSList shortkeys; // List of shortkeys (usually, hashes) of 
idioms, emotion or quality words, interjections etc, that appear in the text + public HashMap> getDerivationSteps() throws Exception { return derivationSteps; } - + + public CSList getShortkeys() { + return shortkeys; + } + private CSList dimDerivationSteps(String dimension) throws Exception { CSList dimlist = getDerivationSteps().get(dimension); if (dimlist == null) @@ -61,7 +68,7 @@ public StringVector() { Flags = new HashMap(); } - public void init(String[] keys, String[] fields) throws Exception { + public void init(String[] keys, String[] fields, String vectorkey) throws Exception { for (int i = 0;i < keys.length;i++) { String value = (StringSupport.isNullOrEmpty(fields[i])) ? DefaultValue : fields[i]; @@ -73,6 +80,8 @@ public void init(String[] keys, String[] fields) throws Exception { else this.put(keys[i], value); } + String vectorshortkey = HashUtils.getShortkey(vectorkey); + shortkeys.add(vectorshortkey); } public String[] getDimensions() throws Exception { diff --git a/src/main/java/sirocco/indexer/dictionaries/GenericDictionary.java b/src/main/java/sirocco/indexer/dictionaries/GenericDictionary.java index c8aeaa8..2895f22 100644 --- a/src/main/java/sirocco/indexer/dictionaries/GenericDictionary.java +++ b/src/main/java/sirocco/indexer/dictionaries/GenericDictionary.java @@ -169,7 +169,7 @@ else if (line.charAt(i) == ',') } TVector v = (TVector) vectorFactory.createNewVector(); String[] fieldnamesarray = fieldnames.toArray(new String[fieldnames.size()]); - v.init(fieldnamesarray, vectorfields); + v.init(fieldnamesarray, vectorfields,key); getWords().put(key, v); } diff --git a/src/main/java/sirocco/indexer/dictionaries/en/EnglishDictionaries.java b/src/main/java/sirocco/indexer/dictionaries/en/EnglishDictionaries.java index 3e8024d..cd7e62f 100644 --- a/src/main/java/sirocco/indexer/dictionaries/en/EnglishDictionaries.java +++ b/src/main/java/sirocco/indexer/dictionaries/en/EnglishDictionaries.java @@ -43,7 +43,7 @@ public class EnglishDictionaries 
public EmotionDictionary Emotions; public InterjectionDictionary Interjections; public QualityDictionary Qualities; - public SubstituitionDictionary Substituitions; + public SubstitutionDictionary Substitutions; public DegreeAdverbDictionary DegreeAdverbs; public ModifierDictionary Modifiers; public NegatorDictionary Negators; @@ -60,7 +60,7 @@ public EnglishDictionaries() throws Exception { Emotions = new EmotionDictionary(getClass().getResourceAsStream("/csdict/emotions-en.csv")); Interjections = new InterjectionDictionary(getClass().getResourceAsStream("/csdict/interjections-en.csv")); Qualities = new QualityDictionary(getClass().getResourceAsStream("/csdict/qualities-en.csv")); - Substituitions = new SubstituitionDictionary(getClass().getResourceAsStream("/csdict/substituitions-en.csv")); + Substitutions = new SubstitutionDictionary(getClass().getResourceAsStream("/csdict/substitutions-en.csv")); DegreeAdverbs = new DegreeAdverbDictionary(getClass().getResourceAsStream("/csdict/degree-adv-en.csv")); Modifiers = new ModifierDictionary(getClass().getResourceAsStream("/csdict/modifiers-en.csv")); Negators = new NegatorDictionary(getClass().getResourceAsStream("/csdict/negators-en.csv")); diff --git a/src/main/java/sirocco/indexer/dictionaries/en/SubstituitionDictionary.java b/src/main/java/sirocco/indexer/dictionaries/en/SubstitutionDictionary.java similarity index 86% rename from src/main/java/sirocco/indexer/dictionaries/en/SubstituitionDictionary.java rename to src/main/java/sirocco/indexer/dictionaries/en/SubstitutionDictionary.java index ff1a96c..7b1f710 100644 --- a/src/main/java/sirocco/indexer/dictionaries/en/SubstituitionDictionary.java +++ b/src/main/java/sirocco/indexer/dictionaries/en/SubstitutionDictionary.java @@ -32,9 +32,9 @@ import sirocco.indexer.StringVectorFactory; import sirocco.indexer.dictionaries.GenericDictionary; -public class SubstituitionDictionary extends GenericDictionary +public class SubstitutionDictionary extends GenericDictionary 
{ - public SubstituitionDictionary(InputStream dictionarystream) throws Exception { + public SubstitutionDictionary(InputStream dictionarystream) throws Exception { super(dictionarystream,new StringVectorFactory()); } diff --git a/src/main/java/sirocco/indexer/util/LangUtils.java b/src/main/java/sirocco/indexer/util/LangUtils.java index 2319a1d..027a871 100644 --- a/src/main/java/sirocco/indexer/util/LangUtils.java +++ b/src/main/java/sirocco/indexer/util/LangUtils.java @@ -90,7 +90,11 @@ else if (Character.isLetter(c)) neutralCount++; } } - if ((float)(foreignCount / (float)(latinCount + neutralCount + foreignCount)) > 0.1F) + if ( ((float)(neutralCount / (float)(latinCount + neutralCount + foreignCount)) > 0.50F) && (neutralCount > 50) ) + // too many of neutral chars will cause extreme delays in processing. Also, this is not a real text blurb + //"Too many invalid characters. Latin chars "+latinCount+", Non-latin chars "+foreignCount+", Neutral chars "+neutralCount); + return Language.UnindexabeText; + else if ((float)(foreignCount / (float)(latinCount + neutralCount + foreignCount)) > 0.1F) return Language.Undetermined; else return Language.English; diff --git a/src/main/java/sirocco/indexer/util/LogUtils.java b/src/main/java/sirocco/indexer/util/LogUtils.java index a70cba4..8e704ea 100644 --- a/src/main/java/sirocco/indexer/util/LogUtils.java +++ b/src/main/java/sirocco/indexer/util/LogUtils.java @@ -109,6 +109,9 @@ public static void printTopSentiments(long taskID, ContentIndex contentindex, St String data = ltext.LabelledPositions.stringSerialize(); sb.append(data); sb.append(System.lineSeparator()); + sb.append("Sentiment {" + i + "} Signals: "); + sb.append(LangUtils.printStringList(ltext.getContainedSignalShortkeys(),", ")); + sb.append(System.lineSeparator()); } } diff --git a/src/main/java/sirocco/model/ContentIndex.java b/src/main/java/sirocco/model/ContentIndex.java index ff1f0dd..40153fa 100644 --- a/src/main/java/sirocco/model/ContentIndex.java +++ 
b/src/main/java/sirocco/model/ContentIndex.java @@ -109,7 +109,9 @@ public class ContentIndex /* Final results of indexing */ public TextTag[] TopTags; public CSList SelectedSentiments; - + public Boolean IsIndexingSuccessful = false; + public String IndexingErrors; + /** * Temporary performance stats. */ diff --git a/src/main/java/sirocco/model/LabelledText.java b/src/main/java/sirocco/model/LabelledText.java index b3e5591..3e2c89b 100644 --- a/src/main/java/sirocco/model/LabelledText.java +++ b/src/main/java/sirocco/model/LabelledText.java @@ -42,15 +42,14 @@ public class LabelledText public String Text; public LabelledPositionsV2 LabelledPositions; public CSList ContainedEntities = new CSList(); + public void addContainedEntity(String entity, int rank) throws Exception { if (!ContainedEntities.contains(entity)) { ContainedEntities.add(entity); if (ContainedEntityTopRank > rank) ContainedEntityTopRank = rank; - } - } public void addSentence(LabelledSentence lsentence, FloatVector sentiment) throws Exception { @@ -60,6 +59,11 @@ public void addSentence(LabelledSentence lsentence, FloatVector sentiment) throw addContainedEntity(entity,lsentence.ContainedEntityTopRank); } + public CSList getContainedSignalShortkeys() + { + return AggregateSentiment.getShortkeys(); + } + } diff --git a/src/main/java/sirocco/model/summary/ContentIndexSummary.java b/src/main/java/sirocco/model/summary/ContentIndexSummary.java index b69ecab..76ebf74 100644 --- a/src/main/java/sirocco/model/summary/ContentIndexSummary.java +++ b/src/main/java/sirocco/model/summary/ContentIndexSummary.java @@ -26,6 +26,9 @@ package sirocco.model.summary; + +import java.util.Arrays; + import CS2JNet.System.Collections.LCC.CSList; import sirocco.indexer.IndexingConsts; import sirocco.model.LabelledText; @@ -79,6 +82,58 @@ public void initialize(String url, Long publicationTime, Long processingTime, } } + + public ContentIndexSummary copy(){ + ContentIndexSummary result = new ContentIndexSummary(); + + // 
Create Document + result.doc = new Document(); + + DocumentTag[] dTags = new DocumentTag[this.doc.tags.length]; + for (int i = 0; i < this.doc.tags.length; i++) { + dTags[i] = new DocumentTag(); + dTags[i].initialize(this.doc.tags[i].tag, this.doc.tags[i].weight, this.doc.tags[i].goodAsTopic); + } + + result.doc.initialize(this.doc.documentHash, this.doc.publicationTime, this.doc.publicationDateId,this.doc.processingTime, + this.doc.processingDateId, this.doc.documentCollectionId, this.doc.collectionItemId, + this.doc.title, this.doc.type, this.doc.language, this.doc.contentParseDepth, this.doc.contentLength, + this.doc.author, this.doc.text, dTags, this.doc.mainWebResourceHash, this.doc.parentWebResourceHash); + + // Create the Web Resource + result.wr = new WebResource(); + result.wr.initialize(this.wr.webResourceHash, this.wr.url, + this.wr.publicationTime, this.wr.publicationDateId, this.wr.processingTime, this.wr.processingDateId, + this.wr.documentHash, this.wr.documentCollectionId, this.wr.collectionItemId, + this.wr.title, this.wr.domain, this.wr.author, this.wr.parentWebResourceHash); + + // Create Sentiments + result.sentiments = new Sentiment[this.sentiments.length]; + for (int i = 0; i < this.sentiments.length; i++) { + result.sentiments[i] = new Sentiment(); + + SentimentTag[] sTags = new SentimentTag[this.sentiments[i].tags.length]; + for (int j = 0; j < this.sentiments[i].tags.length; j++) { + sTags[j] = new SentimentTag(); + sTags[j].initialize(this.sentiments[i].tags[j].tag, this.sentiments[i].tags[j].goodAsTopic); + } + + String[] sigarray = Arrays.copyOf(this.sentiments[i].signals, this.sentiments[i].signals.length); + + result.sentiments[i].initialize(this.sentiments[i].sentimentHash, this.sentiments[i].documentHash, this.sentiments[i].documentTime, this.sentiments[i].documentDateId, + this.sentiments[i].text, this.sentiments[i].labelledPositions, this.sentiments[i].annotatedText, this.sentiments[i].annotatedHtmlText, + 
this.sentiments[i].sentimentTotalScore, this.sentiments[i].dominantValence, + this.sentiments[i].stAcceptance, this.sentiments[i].stAnger, this.sentiments[i].stAnticipation, this.sentiments[i].stAmbiguous, this.sentiments[i].stDisgust, + this.sentiments[i].stFear, this.sentiments[i].stGuilt, this.sentiments[i].stInterest, this.sentiments[i].stJoy, this.sentiments[i].stSadness, this.sentiments[i].stShame, + this.sentiments[i].stSurprise, this.sentiments[i].stPositive, this.sentiments[i].stNegative, this.sentiments[i].stSentiment, this.sentiments[i].stProfane, this.sentiments[i].stUnsafe, + this.sentiments[i].mainWebResourceHash, this.sentiments[i].parentWebResourceHash, + sTags, sigarray); + } + + + return result; + + } } diff --git a/src/main/java/sirocco/model/summary/Document.java b/src/main/java/sirocco/model/summary/Document.java index f202ddb..fe245cf 100644 --- a/src/main/java/sirocco/model/summary/Document.java +++ b/src/main/java/sirocco/model/summary/Document.java @@ -69,7 +69,7 @@ public void initialize(String documentHash, Long publicationTime, Integer public String title, IndexingConsts.ContentType type, String language, IndexingConsts.ParseDepth contentParseDepth, Integer contentLength, String author, String text, - DocumentTag[] tags) { + DocumentTag[] tags, String mainWebResourceHash, String parentWebResourceHash ) { this.documentHash = documentHash; this.publicationTime = publicationTime; this.publicationDateId = publicationDateId; @@ -115,7 +115,7 @@ public void initialize(Long publicationTime, Long processingTime, this.initialize(documentHash, publicationTime, publicationDateId, processingTime, processingDateId, documentCollectionId, collectionItemId, - title, contentType, language, contentParseDepth, contentLength, author, text, dTags); + title, contentType, language, contentParseDepth, contentLength, author, text, dTags, null, null); } diff --git a/src/main/java/sirocco/model/summary/Sentiment.java 
b/src/main/java/sirocco/model/summary/Sentiment.java index 2bb6662..8d81c37 100644 --- a/src/main/java/sirocco/model/summary/Sentiment.java +++ b/src/main/java/sirocco/model/summary/Sentiment.java @@ -75,9 +75,53 @@ public class Sentiment @Nullable public String parentWebResourceHash; @Nullable public SentimentTag[] tags; + @Nullable public String[] signals; // Signals are shortkeys of idioms, emotions etc. - public Sentiment() { - } + public Sentiment() { } + + public void initialize(String sentimentHash, String documentHash, Long documentTime, Integer documentDateId, + String text, String labelledPositions, String annotatedText, String annotatedHtmlText, + Integer sentimentTotalScore, IndexingConsts.SentimentValence dominantValence, + Integer stAcceptance, Integer stAnger, Integer stAnticipation, Integer stAmbiguous, Integer stDisgust, + Integer stFear, Integer stGuilt, Integer stInterest, Integer stJoy, Integer stSadness, Integer stShame, + Integer stSurprise, Integer stPositive, Integer stNegative, Integer stSentiment, Integer stProfane, Integer stUnsafe, + String mainWebResourceHash, String parentWebResourceHash, + SentimentTag[] tags, String[] signals) + { + this.sentimentHash = sentimentHash; + this.documentHash = documentHash; + this.documentTime = documentTime; + this.documentDateId = documentDateId; + this.text = text; + this.labelledPositions = labelledPositions; + this.annotatedText = annotatedText; + this.annotatedHtmlText = annotatedHtmlText; + this.sentimentTotalScore = sentimentTotalScore; + this.dominantValence = dominantValence; + this.stAcceptance = stAcceptance; + this.stAnger = stAnger; + this.stAnticipation = stAnticipation; + this.stAmbiguous = stAmbiguous; + this.stDisgust = stDisgust; + this.stFear = stFear; + this.stGuilt = stGuilt; + this.stInterest = stInterest; + this.stJoy = stJoy ; + this.stSadness = stSadness; + this.stShame = stShame; + this.stSurprise = stSurprise; + this.stPositive = stPositive ; + this.stNegative = stNegative; + 
this.stSentiment = stSentiment; + this.stProfane = stProfane; + this.stUnsafe = stUnsafe; + this.mainWebResourceHash = mainWebResourceHash; + this.parentWebResourceHash = parentWebResourceHash; + this.tags = tags; + this.signals = signals; + + } + public void initialize(String documentHash, Long documentTime, Integer documentDateId, LabelledText lt, TextTag[] wrTags, String mainWebResourceHash, String parentWebResourceHash) { @@ -141,6 +185,13 @@ public void initialize(String documentHash, Long documentTime, Integer documentD } } } + + this.signals = new String[lt.getContainedSignalShortkeys().size()]; + try { + lt.getContainedSignalShortkeys().copyTo(this.signals, 0); + } catch (Exception e) { + // TODO: report to LOGS + } } diff --git a/src/main/java/sirocco/util/HashUtils.java b/src/main/java/sirocco/util/HashUtils.java index 7cf6ac5..04796e0 100644 --- a/src/main/java/sirocco/util/HashUtils.java +++ b/src/main/java/sirocco/util/HashUtils.java @@ -72,17 +72,17 @@ public static String getSHA1HashBase64(String value) { /** - * Calculates base64(sha1(value)) - * This string is URL and JSON safe but 35% shorter than the simple SHA1 - * + * Calculates a text key 8 characters or shorter. Resulting key is URL and JSON safe. + * Current formula is substr(base64(sha256(value)),1,8), but in the future it + * might start returning the actual input string if it is shorter than 8 chars. + * * @param value - string to be hashed - * @return A 21 byte string in base 64. + * @return A string in base 64, 8 chars or shorter. 
*/ - public static String getShortHash(String value) { - byte[] hash = DigestUtils.sha1(value); - byte[] shorterhash = new byte[hash.length]; - - return IdConverterUtils.convertByteArrayToBase64String(hash,null); + public static String getShortkey(String value) { + byte[] hash = DigestUtils.sha256(value); + String hashString = IdConverterUtils.convertByteArrayToBase64String(hash,null); + return hashString.substring(0, 8); } diff --git a/src/test/scripts/rundiff.sh b/src/test/scripts/rundiff.sh new file mode 100644 index 0000000..6f489a0 --- /dev/null +++ b/src/test/scripts/rundiff.sh @@ -0,0 +1,22 @@ + + +if [ $# != 2 ] +then + echo "Invalid number of parameters. Need 2: dir1 dir2" + exit 1 +else + DIR1=$1 + DIR2=$2 +fi + +for f in $(find $DIR1 -name '[^.]*.txt'); do + FILENAME=$(basename $f) + UPPER_DIR=$(basename $(dirname $f)) + NEWNAME=$DIR2"/"$UPPER_DIR"/"$FILENAME + #echo $FILENAME $UPPER_DIR $NEWNAME + + opendiff $f $NEWNAME + +done + + diff --git a/src/test/scripts/runindexer.sh b/src/test/scripts/runindexer.sh index 1b9255b..7359bd1 100644 --- a/src/test/scripts/runindexer.sh +++ b/src/test/scripts/runindexer.sh @@ -1,4 +1,16 @@ + +if [ $# -eq 0 ] +then + INDEXINGTYPE="FULLINDEX" +elif [ "$1" != "FULLINDEX" ] && [ "$1" != "TOPSENTIMENTS" ] +then + echo "Invalid parameter value. Use one of {FULLINDEX | TOPSENTIMENTS}" + exit 1 +else + INDEXINGTYPE=$1 +fi + MAVEN_OPTS="-Xmx2g -Xss4m" INPUT_DIR='./src/test/resources/in' @@ -11,7 +23,7 @@ for f in $(find $INPUT_DIR -name '[^.]*.txt'); do #echo $FILENAME $UPPER_DIR $NEWNAME mvn exec:java \ -Dexec.mainClass=sirocco.cmdline.CLI \ - -Dexec.args="Indexer -inputFile \"$f\" -outputFile \"$NEWNAME\"" + -Dexec.args="Indexer -inputFile \"$f\" -outputFile \"$NEWNAME\" -indexingType \"$INDEXINGTYPE\"" done