Skip to content

Commit

Permalink
Release 1.0.7:
Browse files Browse the repository at this point in the history
- Added MetaFields field to the output of Indexer, where clients can pass metadata to the indexer, and this metadata will be passed through unchanged into the output and stored in WebResource
- In Document, changed hash algorithm to run a hash over collectionItemId, if the text field is empty (useful when a document is empty, but has a unique ID)
  • Loading branch information
datancoffee committed Feb 4, 2018
1 parent da4a600 commit 607eea4
Show file tree
Hide file tree
Showing 7 changed files with 68 additions and 15 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@ Sirocco relies on [Apache OpenNLP](https://opennlp.apache.org/) to supply it wit
Strata NYC 2017 keynote
[Emotional arithmetic: How machine learning helps you understand customers in real time](https://conferences.oreilly.com/strata/strata-ny/public/schedule/detail/63895)
by Chad Jennings from Google showed the results of emotion analysis performed by Sirocco
[Video Recording](https://www.oreilly.com/ideas/emotional-arithmetic-how-machine-learning-helps-you-understand-customers-in-real-time)

Sirocco was featured in the Strata NYC 2017 deep dive
[Emotional arithmetic: A deep dive into how machine learning and big data help you understand customers in real time](https://conferences.oreilly.com/strata/strata-ny/public/schedule/detail/63620) by Chad Jennings (Google) and Eric Schmidt (Google)
[Video Recording](https://www.safaribooksonline.com/library/view/strata-data-conference/9781491976326/video314135.html)

## Roadmap

Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>sirocco.sirocco-sa</groupId>
<artifactId>sirocco-sa</artifactId>
<version>1.0.6</version>
<version>1.0.7</version>
<packaging>jar</packaging>
<name>Sirocco Sentiment Analysis</name>
<description>The sentiment analysis SDK</description>
Expand Down
20 changes: 19 additions & 1 deletion src/main/java/sirocco/indexer/Indexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,21 +27,39 @@
package sirocco.indexer;

import CS2JNet.System.StringSupport;
import CS2JNet.System.Collections.LCC.CSList;
import sirocco.indexer.EnglishIndexerPool;
import sirocco.indexer.Language;
import sirocco.indexer.NonEnglishIndexerPool;
import sirocco.indexer.util.LangUtils;
import sirocco.model.ContentIndex;
import sirocco.model.LabelledText;
import sirocco.model.TextTag;

public class Indexer
{
/**
* Generates NLP info
*/
public static void index(ContentIndex contentindex) throws Exception {

String validationMessage = contentindex.validateInputFields();
if (validationMessage != null) {
contentindex.IndexingErrors = validationMessage;
contentindex.IsIndexingSuccessful = false;
return;
}

if (contentindex.OriginalText == null || contentindex.OriginalText.isEmpty()) {
// No need to run through full indexing. In the previous step we validated that other important fields are available
contentindex.IsIndexingSuccessful = true;
contentindex.populateResultsWithMinValues();
return;
}

// need to split into clean paragraphs before determining language
String[] paragraphs = StructuredSplitter.splitIntoParagraphs(contentindex.OriginalText,contentindex.ContentType);
if (paragraphs.length == 0)
if (paragraphs.length == 0)
return ;

contentindex.initializeParagraphs(paragraphs);
Expand Down
31 changes: 28 additions & 3 deletions src/main/java/sirocco/model/ContentIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import sirocco.annotators.ExtendedLogTextAnnotator;
import sirocco.indexer.FloatVector;
import sirocco.indexer.IndexingConsts;
import sirocco.indexer.Language;
import sirocco.indexer.util.LangUtils;
import sirocco.model.summary.ContentIndexSummary;

Expand Down Expand Up @@ -95,6 +96,8 @@ public class ContentIndex

public Long ParentPubTime;

public String[] MetaFields;

/****************/
/* Intermediate results of indexing */
public String Language;
Expand All @@ -121,14 +124,14 @@ public ContentIndex() {}

/**
 * Creates a content index from the text and its processing parameters only.
 * Delegates to the full constructor, passing null for every remaining input:
 * url, publicationTime, title, author, documentCollectionId, collectionItemId,
 * parentUrl, parentPubTime and metaFields.
 *
 * @param content the original text to index
 * @param indexingType the indexing type to apply
 * @param cueType the content type of the text
 * @param processingTime the processing timestamp
 */
public ContentIndex(String content, IndexingConsts.IndexingType indexingType,
        IndexingConsts.ContentType cueType, Long processingTime) {
    // Nine nulls: one per optional argument of the full constructor, metaFields included.
    this(content, indexingType, cueType, processingTime,
            null, null, null, null, null, null, null, null, null);
}

public ContentIndex(String content, IndexingConsts.IndexingType indexingType,
IndexingConsts.ContentType cueType, Long processingTime,
String url, Long publicationTime, String title, String author,
String documentCollectionId, String collectionItemId,
String parentUrl, Long parentPubTime) {
String parentUrl, Long parentPubTime, String[] metaFields) {
this.OriginalText = content;
this.IndexingType = indexingType;
this.ContentType = cueType;
Expand All @@ -141,8 +144,30 @@ public ContentIndex(String content, IndexingConsts.IndexingType indexingType,
this.CollectionItemId = collectionItemId;
this.ParentUrl = parentUrl;
this.ParentPubTime = parentPubTime;
this.MetaFields = metaFields;
}

/**
 * Validates the input fields that were set through the constructors.
 * A missing (null or empty) text is acceptable only when both the document
 * collection id and the collection item id are non-empty.
 *
 * @return an error message, or null when the inputs are acceptable
 */
public String validateInputFields() {
    boolean hasText = this.OriginalText != null && !this.OriginalText.isEmpty();
    if (hasText) {
        return null;
    }

    boolean hasCollectionId = this.DocumentCollectionId != null && !this.DocumentCollectionId.isEmpty();
    boolean hasItemId = this.CollectionItemId != null && !this.CollectionItemId.isEmpty();
    if (hasCollectionId && hasItemId) {
        return null;
    }

    return "Null or Empty text and collection document id";
}

/**
 * Fills the result fields with minimal values: Language becomes
 * Undetermined, TopTags an empty array and SelectedSentiments an empty list.
 */
public void populateResultsWithMinValues() {
    final TextTag[] noTags = new TextTag[0];
    final CSList<LabelledText> noSentiments = new CSList<LabelledText>();

    this.SelectedSentiments = noSentiments;
    this.TopTags = noTags;
    this.Language = sirocco.indexer.Language.Undetermined;
}

/**
* Should be called on the final content index, after the .index operation has been executed.
* @return The final content index with just the important text stats
Expand All @@ -154,7 +179,7 @@ public ContentIndexSummary getContentIndexSummary()
this.DocumentCollectionId, this.CollectionItemId,
this.Title, this.Author,
this.OriginalText, this.ContentType, this.ContentParseDepth, this.Language, this.TopTags,
this.SelectedSentiments, this.ParentUrl, this.ParentPubTime);
this.SelectedSentiments, this.ParentUrl, this.ParentPubTime, this.MetaFields);

return summary;
}
Expand Down
7 changes: 4 additions & 3 deletions src/main/java/sirocco/model/summary/ContentIndexSummary.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public void initialize(String url, Long publicationTime, Long processingTime,
String documentCollectionId, String collectionItemId,
String title, String author, String text,
IndexingConsts.ContentType contentType, IndexingConsts.ParseDepth contentParseDepth, String language,
TextTag[] topTags, CSList<LabelledText> topSentiments, String parentUrl, Long parentPubTime) {
TextTag[] topTags, CSList<LabelledText> topSentiments, String parentUrl, Long parentPubTime, String[] metaFields) {

// calculate the Parent Web Resource Hash, if available
String parentWebResourceHash = ((parentUrl != null && parentPubTime != null)) ?
Expand All @@ -66,7 +66,7 @@ public void initialize(String url, Long publicationTime, Long processingTime,
this.wr = new WebResource();
this.wr.initialize(url, publicationTime, processingTime,
this.doc.documentHash, documentCollectionId, collectionItemId,
title, author, parentWebResourceHash);
title, author, parentWebResourceHash, metaFields);

// Adjust the document record
this.doc.mainWebResourceHash = this.wr.webResourceHash;
Expand Down Expand Up @@ -102,10 +102,11 @@ public ContentIndexSummary copy(){

// Create the Web Resource
result.wr = new WebResource();
String[] metaFields = Arrays.copyOf(this.wr.metaFields, this.wr.metaFields.length);
result.wr.initialize(this.wr.webResourceHash, this.wr.url,
this.wr.publicationTime, this.wr.publicationDateId, this.wr.processingTime, this.wr.processingDateId,
this.wr.documentHash, this.wr.documentCollectionId, this.wr.collectionItemId,
this.wr.title, this.wr.domain, this.wr.author, this.wr.parentWebResourceHash);
this.wr.title, this.wr.domain, this.wr.author, this.wr.parentWebResourceHash, metaFields);

// Create Sentiments
result.sentiments = new Sentiment[this.sentiments.length];
Expand Down
12 changes: 9 additions & 3 deletions src/main/java/sirocco/model/summary/Document.java
Original file line number Diff line number Diff line change
Expand Up @@ -111,18 +111,24 @@ public void initialize(Long publicationTime, Long processingTime,
Integer processingDateId = IdConverterUtils.getDateIdFromTimestamp(processingTime);

// Determine the ID of Document
String documentHash = calculateDocumentHash(text);
String documentHash = calculateDocumentHash(text, documentCollectionId, collectionItemId);

this.initialize(documentHash, publicationTime, publicationDateId, processingTime, processingDateId,
documentCollectionId, collectionItemId,
title, contentType, language, contentParseDepth, contentLength, author, text, dTags, null, null);

}

public static String calculateDocumentHash(String text) {
public static String calculateDocumentHash(String text, String documentCollectionId, String collectionItemId) {
// TODO: implement a more sophisticated fingerprinting algorithm
// that strips the text of leading and trailing copyright and location
return HashUtils.getSHA1HashBase64(text);

String result = ((text !=null)) ?
HashUtils.getSHA1HashBase64(text) :
((documentCollectionId != null) && (collectionItemId !=null)) ?
HashUtils.getSHA1HashBase64(documentCollectionId + collectionItemId) :
null;
return result;
}


Expand Down
9 changes: 5 additions & 4 deletions src/main/java/sirocco/model/summary/WebResource.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,14 @@ public class WebResource
@Nullable public String domain; // domain of URL
@Nullable public String author; // Person who claims authorship of the resource, if we can determine it
@Nullable public String parentWebResourceHash; // In threaded conversations, the parent is the previous comment, email, or document

@Nullable public String[] metaFields; // Source-specific metadata fields

public WebResource() {}

public void initialize(String webResourceHash, String url,
Long publicationTime, Integer publicationDateId, Long processingTime, Integer processingDateId,
String documentHash, String documentCollectionId, String collectionItemId,
String title, String domain, String author, String parentWebResourceHash) {
String title, String domain, String author, String parentWebResourceHash, String[] metaFields ) {
this.webResourceHash = webResourceHash;
this.url = url;
this.publicationTime = publicationTime;
Expand All @@ -78,13 +78,14 @@ public void initialize(String webResourceHash, String url,
this.domain = domain;
this.author = author;
this.parentWebResourceHash = parentWebResourceHash;
this.metaFields = metaFields;

}

public void initialize(String url,
Long publicationTime, Long processingTime,
String documentHash, String documentCollectionId, String collectionItemId,
String title, String author, String parentWebResourceHash) {
String title, String author, String parentWebResourceHash, String[] metaFields) {

// Determine publication time and Date Ids
if (publicationTime == null)
Expand All @@ -104,7 +105,7 @@ public void initialize(String url,
this.initialize(webResourceHash, url,
publicationTime, publicationDateId, processingTime, processingDateId,
documentHash, documentCollectionId, collectionItemId,
title, domain, author, parentWebResourceHash);
title, domain, author, parentWebResourceHash, metaFields);

}

Expand Down

0 comments on commit 607eea4

Please sign in to comment.