-
-
-
+
+
+
-
- Loading • Chargement • 装载 • Wird geladen
+
+
+
+
+
+
+
+ Loading • Chargement • 装载 • Wird geladen
+
-
diff --git a/sist2-vue/src/components/AnalyzedContentSpan.vue b/sist2-vue/src/components/AnalyzedContentSpan.vue
new file mode 100644
index 0000000..7201539
--- /dev/null
+++ b/sist2-vue/src/components/AnalyzedContentSpan.vue
@@ -0,0 +1,21 @@
+
+ {{span.text}}
+
+
+
+
+
diff --git a/sist2-vue/src/components/AnalyzedContentSpanContainer.vue b/sist2-vue/src/components/AnalyzedContentSpanContainer.vue
new file mode 100644
index 0000000..997b83d
--- /dev/null
+++ b/sist2-vue/src/components/AnalyzedContentSpanContainer.vue
@@ -0,0 +1,75 @@
+
+
+
+
+
+
+
diff --git a/sist2-vue/src/components/LazyContentDiv.vue b/sist2-vue/src/components/LazyContentDiv.vue
index 819d9df..6c21175 100644
--- a/sist2-vue/src/components/LazyContentDiv.vue
+++ b/sist2-vue/src/components/LazyContentDiv.vue
@@ -1,6 +1,36 @@
-
-
+
+
+
+
+ {{ $t("ml.auto") }}
+
+ {{ $t("ml.analyzeText") }}
+
+
+ {{ opt.text }}
+
+
+
+
+
+
+ {{ ((modelLoadingProgress * modelSize) / (1024*1024)).toFixed(1) }}MB / {{
+ (modelSize / (1024 * 1024)).toFixed(1)
+ }}MB
+
+
+
+
+
+
+
+
-
\ No newline at end of file
diff --git a/sist2-vue/src/i18n/messages.ts b/sist2-vue/src/i18n/messages.ts
index 653a8f3..9fe1308 100644
--- a/sist2-vue/src/i18n/messages.ts
+++ b/sist2-vue/src/i18n/messages.ts
@@ -49,6 +49,7 @@ export default {
configReset: "Reset configuration",
searchOptions: "Search options",
treemapOptions: "Treemap options",
+ mlOptions: "Machine learning options",
displayOptions: "Display options",
opt: {
lang: "Language",
@@ -78,7 +79,10 @@ export default {
simpleLightbox: "Disable animations in image viewer",
showTagPickerFilter: "Display the tag filter bar",
featuredFields: "Featured fields Javascript template string. Will appear in the search results.",
- featuredFieldsList: "Available variables"
+ featuredFieldsList: "Available variables",
+ autoAnalyze: "Automatically analyze text",
+ defaultModel: "Default model",
+ mlRepositories: "Model repositories (one per line)"
},
queryMode: {
simple: "Simple",
@@ -171,6 +175,12 @@ export default {
selectedIndex: "selected index",
selectedIndices: "selected indices",
},
+ ml: {
+ analyzeText: "Analyze",
+ auto: "Auto",
+ repoFetchError: "Failed to get list of models. Check browser console for more details.",
+ repoFetchErrorTitle: "Could not fetch model repositories",
+ }
},
de: {
filePage: {
diff --git a/sist2-vue/src/ml/BertNerModel.js b/sist2-vue/src/ml/BertNerModel.js
new file mode 100644
index 0000000..3183336
--- /dev/null
+++ b/sist2-vue/src/ml/BertNerModel.js
@@ -0,0 +1,77 @@
+import BertTokenizer from "@/ml/BertTokenizer";
+import * as tf from "@tensorflow/tfjs";
+import axios from "axios";
+
+/**
+ * Runs a BERT token-classification graph model (loaded with TensorFlow.js) over a text and
+ * reports one label per word.
+ *
+ * Usage: construct, `await init()`, then call `predict()`.
+ */
+export default class BertNerModel {
+    vocabUrl;
+    modelUrl;
+
+    // Lookup from the class id produced by the model to its label.
+    id2label;
+    _tokenizer;
+    _model;
+    // Number of token slots in every tensor fed to the model.
+    inputSize = 128;
+
+    // Word id of the last entry emitted by alignLabels(); reset at the start of predict().
+    _previousWordId = null;
+
+    /**
+     * @param vocabUrl URL the tokenizer vocabulary is fetched from.
+     * @param modelUrl URL handed to `tf.loadGraphModel`.
+     * @param id2label Lookup from predicted class id to label.
+     */
+    constructor(vocabUrl, modelUrl, id2label) {
+        this.vocabUrl = vocabUrl;
+        this.modelUrl = modelUrl;
+        this.id2label = id2label;
+    }
+
+    /**
+     * Loads the vocabulary and the model concurrently.
+     *
+     * @param onProgress Forwarded to `tf.loadGraphModel` as its `onProgress` option.
+     */
+    async init(onProgress) {
+        await Promise.all([this.loadTokenizer(), this.loadModel(onProgress)]);
+    }
+
+    /** Fetches the vocabulary and builds the tokenizer from the response body. */
+    async loadTokenizer() {
+        const vocab = (await axios.get(this.vocabUrl)).data;
+        this._tokenizer = new BertTokenizer(vocab);
+    }
+
+    /** Loads the graph model from `modelUrl`. */
+    async loadModel(onProgress) {
+        this._model = await tf.loadGraphModel(this.modelUrl, {onProgress});
+    }
+
+    /**
+     * Collapses per-token labels into per-word results.
+     *
+     * Slots whose word id is -1 are skipped, and only the first token of each word contributes
+     * its label. `_previousWordId` is kept between calls within one predict() run, so a word
+     * whose tokens continue into the next chunk is not emitted a second time.
+     *
+     * @param labels Label of each token slot.
+     * @param wordIds Index into `words` for each token slot, or -1.
+     * @param words Word objects exposing `text` and `index`.
+     * @returns Array of `{word, wordIndex, label}` objects.
+     */
+    alignLabels(labels, wordIds, words) {
+        const result = [];
+
+        for (let i = 0; i < this.inputSize; i++) {
+            const label = labels[i];
+            const wordId = wordIds[i];
+
+            if (wordId === -1) {
+                continue;
+            }
+            if (wordId === this._previousWordId) {
+                continue;
+            }
+
+            result.push({
+                word: words[wordId].text, wordIndex: words[wordId].index, label: label
+            });
+            this._previousWordId = wordId;
+        }
+
+        return result;
+    }
+
+    /**
+     * Tokenizes `text`, runs the model on each chunk in order and hands the aligned labels of
+     * every chunk to `callback` as soon as they are available.
+     *
+     * @param text Text to analyze.
+     * @param callback Called once per chunk with the array returned by alignLabels().
+     */
+    async predict(text, callback) {
+        this._previousWordId = null;
+        const encoded = this._tokenizer.encodeText(text, this.inputSize);
+
+        for (const chunk of encoded.inputChunks) {
+            // Everything created inside tidy() (input tensors and the raw model output) is
+            // released when it returns; only the argMax result survives.
+            const labelIds = tf.tidy(() => {
+                const rawResult = this._model.execute({
+                    input_ids: tf.tensor2d(chunk.inputIds, [1, this.inputSize], "int32"),
+                    token_type_ids: tf.tensor2d(chunk.segmentIds, [1, this.inputSize], "int32"),
+                    attention_mask: tf.tensor2d(chunk.inputMask, [1, this.inputSize], "int32"),
+                });
+                return tf.argMax(rawResult, -1);
+            });
+
+            let labelIdsArray;
+            try {
+                labelIdsArray = await labelIds.array();
+            } finally {
+                // Release the tensor even when reading it back fails.
+                labelIds.dispose();
+            }
+
+            const labels = labelIdsArray[0].map(id => this.id2label[id]);
+            callback(this.alignLabels(labels, chunk.wordIds, encoded.words));
+        }
+    }
+}
\ No newline at end of file
diff --git a/sist2-vue/src/ml/BertTokenizer.js b/sist2-vue/src/ml/BertTokenizer.js
new file mode 100644
index 0000000..0ea24ad
--- /dev/null
+++ b/sist2-vue/src/ml/BertTokenizer.js
@@ -0,0 +1,184 @@
+import {zip, chunk} from "underscore";
+
+// Token id emitted for a word that cannot be built from vocabulary entries.
+const UNK_INDEX = 100;
+// Token id placed at the start of every input chunk.
+const CLS_INDEX = 101;
+// Token id placed after the last real token of every input chunk.
+const SEP_INDEX = 102;
+// Prefix added to a candidate sub-word that does not start at the beginning of its word.
+const CONTINUING_SUBWORD_PREFIX = "##";
+
+/** Tells whether `ch` contains a character matched by the regex class `\s`. */
+function isWhitespace(ch) {
+    const whitespacePattern = /\s/;
+    return whitespacePattern.test(ch);
+}
+
+/** Tells whether the first UTF-16 unit of `ch` is NUL (0) or the replacement character (0xFFFD). */
+function isInvalid(ch) {
+    const firstUnit = ch.charCodeAt(0);
+    if (firstUnit === 0) {
+        return true;
+    }
+    return firstUnit === 0xfffd;
+}
+
+// Every character the tokenizer splits off as a token of its own.
+const punctuations = '[~`!@#$%^&*(){}[];:"\'<,.>?/\\|-_+=';
+
+/** Tells whether `ch` occurs in the `punctuations` list. */
+function isPunctuation(ch) {
+    return punctuations.includes(ch);
+}
+
+/**
+ * Lower-casing tokenizer that splits text into words and punctuation, then maps every word to
+ * vocabulary ids with a greedy longest-match-first sub-word search.
+ *
+ * Each word carries the UTF-16 offset (`index`) at which it starts in the text that
+ * cleanText() iterates over.
+ */
+export default class BertTokenizer {
+    // Array of vocabulary tokens; the position of a token is its id.
+    vocab;
+
+    // Lazily built token -> id Map and the vocab array it was built from.
+    _vocabIndex = null;
+    _vocabIndexSource = null;
+
+    constructor(vocab) {
+        this.vocab = vocab;
+    }
+
+    /**
+     * Returns a token -> id Map for `this.vocab`, building it on first use and again whenever
+     * `this.vocab` is replaced by another array. In-place changes to the array after the first
+     * call are not picked up.
+     */
+    _getVocabIndex() {
+        if (this._vocabIndex === null || this._vocabIndexSource !== this.vocab) {
+            const index = new Map();
+            this.vocab.forEach((token, id) => {
+                // Keep the first occurrence, which is what Array.prototype.indexOf reports.
+                if (!index.has(token)) {
+                    index.set(token, id);
+                }
+            });
+            this._vocabIndex = index;
+            this._vocabIndexSource = this.vocab;
+        }
+        return this._vocabIndex;
+    }
+
+    /**
+     * Splits `text` into lower-cased words and single punctuation characters.
+     *
+     * @param text Text to tokenize.
+     * @returns Array of `{text, index}` objects in reading order.
+     */
+    tokenize(text) {
+        const charOriginalIndex = [];
+        const cleanedText = this.cleanText(text, charOriginalIndex);
+
+        // Offset of the current token inside cleanedText, in UTF-16 units.
+        let charCount = 0;
+        return cleanedText.split(' ').flatMap((token) => {
+            const pieces = this.runSplitOnPunctuation(token.toLowerCase(), charCount, charOriginalIndex);
+            // Advance by the length before lower-casing: lower-casing can change the length of
+            // a string (e.g. "İ"), while offsets refer to the text that was not lower-cased.
+            charCount += token.length + 1;
+            return pieces;
+        });
+    }
+
+    /**
+     * Performs invalid character removal and whitespace cleanup on text.
+     *
+     * @param text Text to clean.
+     * @param charOriginalIndex Output array: for the UTF-16 offset of every character kept in
+     *        the returned string, receives the offset of that character in the text iterated here.
+     * @returns The cleaned text, with every whitespace run reduced to a single space.
+     */
+    cleanText(text, charOriginalIndex) {
+        // NOTE(review): "?" characters and surrounding whitespace are removed before offsets
+        // are computed, so the recorded offsets refer to this reduced text rather than to the
+        // caller's string. Confirm that callers expect this.
+        text = text.replace(/\?/g, "").trim();
+
+        const stringBuilder = [];
+        let originalCharIndex = 0;
+        let newCharIndex = 0;
+
+        for (const ch of text) {
+            const sourceIndex = originalCharIndex;
+            originalCharIndex += ch.length;
+
+            // Skip the characters that cannot be used.
+            if (isInvalid(ch)) {
+                continue;
+            }
+
+            let kept = ch;
+            if (isWhitespace(ch)) {
+                // Drop leading whitespace and all but the first of consecutive whitespace characters.
+                if (stringBuilder.length === 0 || stringBuilder[stringBuilder.length - 1] === ' ') {
+                    continue;
+                }
+                kept = ' ';
+            }
+
+            stringBuilder.push(kept);
+            charOriginalIndex[newCharIndex] = sourceIndex;
+            // Advance in UTF-16 units, the unit tokenize() and runSplitOnPunctuation() count
+            // in; advancing by one per code point misplaces everything after an astral character.
+            newCharIndex += kept.length;
+        }
+        return stringBuilder.join('');
+    }
+
+    /**
+     * Splits punctuation on a piece of text.
+     *
+     * @param text One whitespace-free piece of the cleaned text.
+     * @param count UTF-16 offset of `text` inside the cleaned text.
+     * @param charOriginalIndex Offset lookup filled by cleanText().
+     * @returns Array of `{text, index}` objects; every punctuation character is its own entry.
+     */
+    runSplitOnPunctuation(text, count, charOriginalIndex) {
+        const tokens = [];
+        let startNewWord = true;
+        for (const ch of text) {
+            if (isPunctuation(ch)) {
+                tokens.push({text: ch, index: charOriginalIndex[count]});
+                startNewWord = true;
+            } else {
+                if (startNewWord) {
+                    tokens.push({text: '', index: charOriginalIndex[count]});
+                    startNewWord = false;
+                }
+                tokens[tokens.length - 1].text += ch;
+            }
+            count += ch.length;
+        }
+        return tokens;
+    }
+
+    /**
+     * Maps words to vocabulary ids.
+     *
+     * @param words Array of objects exposing a `text` string.
+     * @returns `{tokens, wordIds}`: two arrays of equal length, where `wordIds[k]` is the index
+     *          in `words` of the word that produced `tokens[k]`. A word that cannot be fully
+     *          covered by vocabulary entries yields the single id UNK_INDEX.
+     */
+    encode(words) {
+        const vocabIndex = this._getVocabIndex();
+        const outputTokens = [];
+        const wordIds = [];
+
+        words.forEach((word, wordId) => {
+            const chars = [...word.text];
+            const subTokens = [];
+            let isUnknown = false;
+            let start = 0;
+
+            while (start < chars.length) {
+                let end = chars.length;
+                let matchedId = null;
+
+                // Try the longest remaining substring first and shrink it from the right.
+                while (start < end) {
+                    let substr = chars.slice(start, end).join('');
+                    if (start > 0) {
+                        substr = CONTINUING_SUBWORD_PREFIX + substr;
+                    }
+
+                    const id = vocabIndex.get(substr);
+                    if (id !== undefined) {
+                        matchedId = id;
+                        break;
+                    }
+                    --end;
+                }
+
+                if (matchedId === null) {
+                    isUnknown = true;
+                    break;
+                }
+                subTokens.push(matchedId);
+                start = end;
+            }
+
+            if (isUnknown) {
+                outputTokens.push(UNK_INDEX);
+                wordIds.push(wordId);
+            } else {
+                subTokens.forEach(tok => {
+                    outputTokens.push(tok);
+                    wordIds.push(wordId);
+                });
+            }
+        });
+
+        return {tokens: outputTokens, wordIds};
+    }
+
+    /**
+     * Tokenizes and encodes `inputText`, then cuts the result into model inputs of exactly
+     * `inputSize` slots each.
+     *
+     * Every chunk starts with CLS_INDEX, ends its real tokens with SEP_INDEX and is padded with
+     * zeros. `inputMask` is 1 for CLS, SEP and real tokens and 0 for padding, `segmentIds` is
+     * all zeros, and `wordIds` is -1 for CLS, SEP and padding.
+     *
+     * @param inputText Text to encode.
+     * @param inputSize Number of slots per chunk.
+     * @returns `{inputChunks, words}` where `words` is the output of tokenize().
+     */
+    encodeText(inputText, inputSize) {
+        const tokenized = this.tokenize(inputText);
+        const encoded = this.encode(tokenized);
+
+        // Two slots of every chunk are reserved for CLS and SEP.
+        const encodedTokenChunks = chunk(encoded.tokens, inputSize - 2);
+        const encodedWordIdChunks = chunk(encoded.wordIds, inputSize - 2);
+
+        const chunks = [];
+
+        zip(encodedTokenChunks, encodedWordIdChunks).forEach(([tokens, chunkWordIds]) => {
+            const inputIds = [CLS_INDEX, ...tokens, SEP_INDEX];
+            const segmentIds = Array(inputIds.length).fill(0);
+            const inputMask = Array(inputIds.length).fill(1);
+            const wordIds = [-1, ...chunkWordIds, -1];
+
+            while (inputIds.length < inputSize) {
+                inputIds.push(0);
+                inputMask.push(0);
+                segmentIds.push(0);
+                wordIds.push(-1);
+            }
+
+            chunks.push({inputIds, inputMask, segmentIds, wordIds});
+        });
+
+        return {
+            inputChunks: chunks,
+            words: tokenized
+        };
+    }
+}
\ No newline at end of file
diff --git a/sist2-vue/src/ml/modelsRepo.js b/sist2-vue/src/ml/modelsRepo.js
new file mode 100644
index 0000000..16edb2e
--- /dev/null
+++ b/sist2-vue/src/ml/modelsRepo.js
@@ -0,0 +1,43 @@
+import axios from "axios";
+
+/**
+ * In-memory catalogue of the models listed by a set of repository URLs.
+ *
+ * Each repository response is treated as an array of model descriptors, of which the `name`,
+ * `size`, `default`, `modelPath` and `vocabPath` properties are read.
+ */
+class ModelsRepo {
+    _repositories;
+    // Model descriptors keyed by model name.
+    data = {};
+
+    /**
+     * Fetches every repository and registers its models under their name. A model registered
+     * later replaces an earlier one with the same name. Rejects if any repository fails to load.
+     *
+     * @param repositories Array of repository URLs.
+     */
+    async init(repositories) {
+        this._repositories = repositories;
+
+        // Call through an arrow function so the method keeps its receiver and only gets the URL.
+        const data = await Promise.all(
+            this._repositories.map(repository => this._loadRepository(repository))
+        );
+
+        data.forEach(models => {
+            models.forEach(model => {
+                this.data[model.name] = model;
+            });
+        });
+    }
+
+    /**
+     * Fetches one repository and adds absolute `modelUrl` / `vocabUrl` properties to each of
+     * its models, resolving `modelPath` / `vocabPath` against the repository URL.
+     *
+     * @param repository Repository URL.
+     * @returns The array of model descriptors.
+     */
+    async _loadRepository(repository) {
+        const data = (await axios.get(repository)).data;
+        data.forEach(model => {
+            model["modelUrl"] = new URL(model["modelPath"], repository).href;
+            model["vocabUrl"] = new URL(model["vocabPath"], repository).href;
+        });
+        return data;
+    }
+
+    /** Returns one `{text, value}` entry per model; `text` shows the size rounded to whole MB. */
+    getOptions() {
+        return Object.values(this.data).map(model => ({
+            text: `${model.name} (${Math.round(model.size / (1024*1024))}MB)`,
+            value: model.name
+        }));
+    }
+
+    /**
+     * Returns the name of the first model whose `default` property is truthy, or null when no
+     * model is loaded or none is flagged as default.
+     */
+    getDefaultModel() {
+        const defaultModel = Object.values(this.data).find(model => model.default);
+        // find() yields undefined when nothing matches; report that as "no default".
+        return defaultModel === undefined ? null : defaultModel.name;
+    }
+}
+
+export default new ModelsRepo();
\ No newline at end of file
diff --git a/sist2-vue/src/store/index.ts b/sist2-vue/src/store/index.ts
index e095070..37a06ca 100644
--- a/sist2-vue/src/store/index.ts
+++ b/sist2-vue/src/store/index.ts
@@ -57,6 +57,9 @@ export default new Vuex.Store({
optVidPreviewInterval: 700,
optSimpleLightbox: true,
optShowTagPickerFilter: true,
+ optMlRepositories: "https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json",
+ optAutoAnalyze: false,
+ optMlDefaultModel: null,
_onLoadSelectedIndices: [] as string[],
_onLoadSelectedMimeTypes: [] as string[],
@@ -86,7 +89,11 @@ export default new Vuex.Store({
uiMimeMap: [] as any[],
- auth0Token: null
+ auth0Token: null,
+ mlModel: {
+ model: null,
+ name: null
+ },
},
mutations: {
setUiShowDetails: (state, val) => state.uiShowDetails = val,
@@ -172,6 +179,9 @@ export default new Vuex.Store({
setOptVidPreviewInterval: (state, val) => state.optVidPreviewInterval = val,
setOptSimpleLightbox: (state, val) => state.optSimpleLightbox = val,
setOptShowTagPickerFilter: (state, val) => state.optShowTagPickerFilter = val,
+ setOptAutoAnalyze: (state, val) => {state.optAutoAnalyze = val},
+ setOptMlRepositories: (state, val) => {state.optMlRepositories = val},
+ setOptMlDefaultModel: (state, val) => {state.optMlDefaultModel = val},
setOptLightboxLoadOnlyCurrent: (state, val) => state.optLightboxLoadOnlyCurrent = val,
setOptLightboxSlideDuration: (state, val) => state.optLightboxSlideDuration = val,
@@ -194,6 +204,7 @@ export default new Vuex.Store({
// noop
},
setAuth0Token: (state, val) => state.auth0Token = val,
+ setMlModel: (state, val) => state.mlModel = val,
},
actions: {
setSist2Info: (store, val) => {
@@ -350,6 +361,7 @@ export default new Vuex.Store({
},
modules: {},
getters: {
+ mlModel: (state) => state.mlModel,
seed: (state) => state.seed,
getPathText: (state) => state.pathText,
indices: state => state.indices,
@@ -416,5 +428,12 @@ export default new Vuex.Store({
optSimpleLightbox: state => state.optSimpleLightbox,
optShowTagPickerFilter: state => state.optShowTagPickerFilter,
optFeaturedFields: state => state.optFeaturedFields,
+ optMlRepositories: state => state.optMlRepositories,
+        // Repository URLs, one per line of optMlRepositories. Lines are trimmed (which also
+        // removes a trailing "\r") and blank lines are dropped, so an empty setting, a leading
+        // blank line or a trailing newline never produces an empty URL.
+        mlRepositoryList: state => {
+            return state.optMlRepositories
+                .split("\n")
+                .map(repo => repo.trim())
+                .filter(repo => repo !== "");
+        },
+ optMlDefaultModel: state => state.optMlDefaultModel,
+ optAutoAnalyze: state => state.optAutoAnalyze,
}
})
\ No newline at end of file
diff --git a/sist2-vue/src/views/Configuration.vue b/sist2-vue/src/views/Configuration.vue
index 80dc2e0..84e20ff 100644
--- a/sist2-vue/src/views/Configuration.vue
+++ b/sist2-vue/src/views/Configuration.vue
@@ -1,202 +1,218 @@
-
-
-
-
-
- {{ $t("config") }}
-
- {{ $t("configDescription") }}
-
-
- {{ $t("displayOptions") }}
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- {{ $t("opt.lightboxLoadOnlyCurrent") }}
-
-
-
- {{ $t("opt.hideLegacy") }}
-
-
-
- {{ $t("opt.updateMimeMap") }}
-
-
-
- {{ $t("opt.useDatePicker") }}
-
-
- {{
- $t("opt.simpleLightbox")
- }}
-
-
- {{
- $t("opt.showTagPickerFilter")
- }}
-
-
-
-
-
-
- {{
- $t("opt.featuredFieldsList")
- }}
-
-
-
- doc.checksum
- doc.path
- doc.mime
- doc.videoc
- doc.audioc
- doc.pages
- doc.mtime
- doc.font_name
- doc.album
- doc.artist
- doc.title
- doc.genre
- doc.album_artist
- doc.exif_make
- doc.exif_model
- doc.exif_software
- doc.exif_exposure_time
- doc.exif_fnumber
- doc.exif_iso_speed_ratings
- doc.exif_focal_length
- doc.exif_user_comment
- doc.exif_user_comment
- doc.exif_gps_longitude_ref
- doc.exif_gps_longitude_dms
- doc.exif_gps_longitude_dec
- doc.exif_gps_latitude_ref
- doc.exif_gps_latitude_dec
- humanDate()
- humanFileSize()
-
-
- {{ $t("forExample") }}
-
-
- -
-
<b>${humanDate(doc.mtime)}</b> • ${doc.videoc || ''}
-
- -
-
${doc.pages ? (doc.pages + ' pages') : ''}
-
-
-
-
-
-
+
+
+ {{ $t("config") }}
+
+ {{ $t("configDescription") }}
+
+
+ {{ $t("displayOptions") }}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {{ $t("opt.lightboxLoadOnlyCurrent") }}
+
+
+
+ {{ $t("opt.hideLegacy") }}
+
+
+
+ {{ $t("opt.updateMimeMap") }}
+
+
+
+ {{ $t("opt.useDatePicker") }}
+
+
+ {{
+ $t("opt.simpleLightbox")
+ }}
+
+
+ {{
+ $t("opt.showTagPickerFilter")
+ }}
+
+
+
+
+
+
+ {{
+ $t("opt.featuredFieldsList")
+ }}
+
+
+
+ doc.checksum
+ doc.path
+ doc.mime
+ doc.videoc
+ doc.audioc
+ doc.pages
+ doc.mtime
+ doc.font_name
+ doc.album
+ doc.artist
+ doc.title
+ doc.genre
+ doc.album_artist
+ doc.exif_make
+ doc.exif_model
+ doc.exif_software
+ doc.exif_exposure_time
+ doc.exif_fnumber
+ doc.exif_iso_speed_ratings
+ doc.exif_focal_length
+ doc.exif_user_comment
+ doc.exif_user_comment
+ doc.exif_gps_longitude_ref
+ doc.exif_gps_longitude_dms
+ doc.exif_gps_longitude_dec
+ doc.exif_gps_latitude_ref
+ doc.exif_gps_latitude_dec
+ humanDate()
+ humanFileSize()
+
+
+ {{ $t("forExample") }}
+
+
+ -
+
<b>${humanDate(doc.mtime)}</b> • ${doc.videoc || ''}
+
+ -
+
${doc.pages ? (doc.pages + ' pages') : ''}
+
+
+
+
+
+
+
+
+
+ {{ $t("searchOptions") }}
+
+ {{
+ $t("opt.hideDuplicates")
+ }}
+
+
+ {{
+ $t("opt.highlight")
+ }}
+
+ {{
+ $t("opt.tagOrOperator")
+ }}
+
+ {{ $t("opt.fuzzy") }}
+ {{
+ $t("opt.searchInPath")
+ }}
+
+ {{
+ $t("opt.suggestPath")
+ }}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {{ $t("mlOptions") }}
+
+
+
+
+ {{
+ $t("opt.autoAnalyze")
+ }}
+
+
+
+ {{ $t("treemapOptions") }}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {{ $t("configReset") }}
+
-
-
{{ $t("searchOptions") }}
-
- {{
- $t("opt.hideDuplicates")
- }}
-
-
- {{ $t("opt.highlight") }}
- {{
- $t("opt.tagOrOperator")
- }}
-
- {{ $t("opt.fuzzy") }}
- {{
- $t("opt.searchInPath")
- }}
-
- {{
- $t("opt.suggestPath")
- }}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
-
- {{ $t("treemapOptions") }}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- {{ $t("configReset") }}
-
-
-
-
-
-
-
-
+
+
\ No newline at end of file
diff --git a/src/sist.h b/src/sist.h
index 6fb7e36..f660aa4 100644
--- a/src/sist.h
+++ b/src/sist.h
@@ -51,11 +51,11 @@
#include
#include "git_hash.h"
-#define VERSION "3.0.3"
+#define VERSION "3.0.4"
static const char *const Version = VERSION;
static const int VersionMajor = 3;
static const int VersionMinor = 0;
-static const int VersionPatch = 3;
+static const int VersionPatch = 4;
#ifndef SIST_PLATFORM
#define SIST_PLATFORM unknown