-
Notifications
You must be signed in to change notification settings - Fork 93
/
Copy pathFileClassifier.java
111 lines (88 loc) · 3.76 KB
/
FileClassifier.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
package com.searchcode.app.util;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonSyntaxException;
import com.searchcode.app.config.Values;
import com.searchcode.app.dto.FileClassifierResult;
import com.searchcode.app.service.Singleton;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
/**
 * Guesses the language of a file from its name and contents using a database of
 * {@link FileClassifierResult} entries (each entry carries a language name, a list of
 * extensions and a list of keywords).
 *
 * The database is either loaded from a JSON file on disk (default constructor) or
 * supplied directly by the caller.
 */
public class FileClassifier {

    /** Value returned whenever no language can be determined. */
    private static final String UNKNOWN = "Unknown";

    /** Location of the JSON classifier database, read from the application properties. */
    private final String DATABASEPATH = Properties.getProperties().getProperty(Values.CLASSIFIER_DATABASE_LOCATION, Values.DEFAULT_CLASSIFIER_DATABASE_LOCATION);

    private List<FileClassifierResult> database = new ArrayList<>();

    /**
     * Creates a classifier backed by the JSON database found at the configured location.
     * If the file cannot be read or parsed the database is left empty and a warning is logged.
     */
    public FileClassifier() {
        this.database = this.loadDatabase();
    }

    /**
     * Creates a classifier backed by the supplied database.
     *
     * @param database the classifier entries to use
     */
    public FileClassifier(List<FileClassifierResult> database) {
        this.database = database;
    }

    /**
     * @return the classifier entries currently in use
     */
    public List<FileClassifierResult> getDatabase() {
        return this.database;
    }

    /**
     * Replaces the classifier entries.
     *
     * @param database the classifier entries to use from now on
     */
    public void setDatabase(List<FileClassifierResult> database) {
        this.database = database;
    }

    /**
     * Given a filename and the lines inside the file attempts to guess the type of the file.
     *
     * The text after the last dot of the name (or the whole name when it contains no dot) is
     * lower-cased and looked up against the extensions of every database entry. A single hit is
     * returned directly; when several entries share the extension, the one whose keywords occur
     * most often in {@code codeLines} wins.
     *
     * TODO When no match attempt to identify using the file keywords
     *
     * @param fileName  name of the file; {@code null} or a name without any usable segment
     *                  (for example {@code "."}) yields {@code "Unknown"}
     * @param codeLines lines of the file, used only to break ties; {@code null} is treated as empty
     * @return the language name, {@code "Text"} for {@code .txt} files, or {@code "Unknown"}
     *         when nothing matches or no candidate keyword appears in the lines
     */
    public String languageGuesser(String fileName, List<String> codeLines) {
        if (fileName == null) {
            return UNKNOWN;
        }

        // String.split drops trailing empty strings, so names such as "." or ".."
        // produce a zero-length array; indexing it would throw.
        String[] parts = fileName.split("\\.");
        if (parts.length == 0) {
            return UNKNOWN;
        }

        // Locale.ROOT keeps the mapping stable (e.g. "INI" -> "ini") regardless of
        // the default locale of the JVM.
        String extension = parts[parts.length - 1].toLowerCase(Locale.ROOT);

        if ("txt".equals(extension)) {
            return "Text";
        }

        // Find all languages that might be this one
        List<FileClassifierResult> candidates = new ArrayList<>();
        if (this.database != null) {
            for (FileClassifierResult entry : this.database) {
                if (entry != null && ArrayUtils.contains(entry.extensions, extension)) {
                    candidates.add(entry);
                }
            }
        }

        if (candidates.isEmpty()) {
            // Check against all using the pattern and see if we can guess
            return UNKNOWN;
        }

        if (candidates.size() == 1) {
            return candidates.get(0).language;
        }

        // More than one possible match, check which one is most likely and return that.
        // The strict comparison means the first candidate with the highest count wins and
        // a candidate with zero keyword hits is never selected.
        String languageGuess = "";
        int bestKeywords = 0;

        for (FileClassifierResult candidate : candidates) {
            int matchingKeywords = this.countKeywordMatches(candidate, codeLines);

            if (matchingKeywords > bestKeywords) {
                bestKeywords = matchingKeywords;
                languageGuess = candidate.language;
            }
        }

        if (languageGuess == null || languageGuess.trim().equals(Values.EMPTYSTRING)) {
            languageGuess = UNKNOWN;
        }

        return languageGuess;
    }

    /**
     * Counts how many times the keywords of the candidate occur across all supplied lines.
     * Missing keywords or lines count as zero occurrences.
     */
    private int countKeywordMatches(FileClassifierResult candidate, List<String> codeLines) {
        if (candidate.keywords == null || codeLines == null) {
            return 0;
        }

        int total = 0;
        for (String line : codeLines) {
            for (String keyword : candidate.keywords) {
                total += StringUtils.countMatches(line, keyword);
            }
        }
        return total;
    }

    /**
     * Loads the File Classifier database from a JSON file on disk.
     *
     * @return the parsed entries, or an empty list when the file is missing, unreadable,
     *         empty or not valid JSON (a warning is logged in the failure cases)
     */
    private List<FileClassifierResult> loadDatabase() {
        List<FileClassifierResult> loaded = new ArrayList<>();
        Gson gson = new GsonBuilder().create();

        // try-with-resources guarantees the reader is closed even when parsing fails
        try (FileReader reader = new FileReader(this.DATABASEPATH)) {
            FileClassifierResult[] entries = gson.fromJson(reader, FileClassifierResult[].class);

            // Gson returns null when the input is empty
            if (entries != null) {
                loaded.addAll(Arrays.asList(entries));
            }
        }
        catch (IOException | JsonSyntaxException ex) {
            Singleton.getLogger().warning("Unable to load '" + DATABASEPATH + "' file. File classification will not work. " + ex.toString());
        }

        return loaded;
    }
}