Gitlab Community Edition Instance

Commit f98cc77c authored by mhellka's avatar mhellka
Browse files

es-ingest: Skip files known to not contain any searchable text

parent 6fa207a5
......@@ -339,6 +339,9 @@ public class ElasticIngestJob implements AutoCloseable {
// Calculate doc.content, if needed and available
if (file.size == 0) {
doc.content = "";
} else if (!mayContainText(file)) {
doc.content = "";
doc.tags.add("not_analysed");
} else if (contentModified && contentAvailable) {
final ParseResult parsed = extractText(file);
if (parsed.isTruncated())
......@@ -367,6 +370,24 @@ public class ElasticIngestJob implements AutoCloseable {
}
}
/**
 * Decides whether a file is worth passing to text extraction, based on its
 * size and declared media type.
 *
 * <p>Empty files never contain text. A {@code null} type is treated as
 * {@code application/octet-stream}. Types under {@code text/} are always
 * accepted; types under {@code image/}, {@code video/} and {@code audio/}
 * are rejected; every other type is accepted by default.
 *
 * <p>Media type names are matched case-insensitively (RFC 2045 section 5.1),
 * so a value such as {@code Image/PNG} is rejected just like
 * {@code image/png}.
 *
 * @param file the file whose {@code size} and {@code type} fields are inspected
 * @return {@code true} if the file may contain searchable text,
 *         {@code false} if it is empty or has a blacklisted media type
 */
private boolean mayContainText(FileInfo file) {
    if (file.size == 0) {
        return false;
    }

    final String ftype = file.type != null ? file.type : "application/octet-stream";

    // Whitelist
    if (hasMediaTypePrefix(ftype, "text/")) {
        return true;
    }

    // Blacklist
    if (hasMediaTypePrefix(ftype, "image/")
            || hasMediaTypePrefix(ftype, "video/")
            || hasMediaTypePrefix(ftype, "audio/")) {
        return false;
    }

    // default: unknown types are handed to the text extractor
    return true;
}

/** Returns whether {@code mediaType} starts with {@code prefix}, ignoring case. */
private static boolean hasMediaTypePrefix(String mediaType, String prefix) {
    // regionMatches returns false (rather than throwing) if mediaType is shorter than prefix.
    return mediaType.regionMatches(true, 0, prefix, 0, prefix.length());
}
private ParseResult extractText(FileInfo file) throws IOException {
final ParseResult parsed;
try (InputStream stream = cdstar.execute(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment