ANY23-415 avoid n-triples errors for humans.txt files, etc
author Hans <firedrake93@gmail.com>
Sat, 9 Feb 2019 10:19:07 +0000 (04:19 -0600)
committer Hans <firedrake93@gmail.com>
Sat, 9 Feb 2019 10:19:07 +0000 (04:19 -0600)
core/src/main/java/org/apache/any23/extractor/ExtractionResultImpl.java
core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
core/src/test/java/org/apache/any23/Any23Test.java
test-resources/src/test/resources/rdf/issue415.txt [new file with mode: 0644]

index 7a2d123..636b230 100644 (file)
@@ -182,6 +182,10 @@ public class ExtractionResultImpl implements TagSoupExtractionResult {
         }
     }
 
+    boolean wasTouched() { // package-private view of the current isInitialized flag
+        return this.isInitialized;
+    }
+
     @Override
     public void writeTriple(Resource s, IRI p, Value o) {
         writeTriple(s, p, o, null);
index e84ab61..ea5608c 100644 (file)
@@ -42,6 +42,7 @@ import org.apache.any23.writer.TripleHandlerException;
 import org.apache.any23.extractor.Extractor.BlindExtractor;
 import org.apache.any23.extractor.Extractor.ContentExtractor;
 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
+import org.apache.tika.mime.MimeTypes;
 import org.eclipse.rdf4j.model.BNode;
 import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
@@ -59,6 +60,7 @@ import java.util.Collection;
 import java.util.Collections;
 import java.util.Date;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.UUID;
@@ -253,14 +255,21 @@ public class SingleDocumentExtraction {
             final String documentLanguage;
                try {
                    documentLanguage = extractDocumentLanguage(extractionParameters);
-                   for (ExtractorFactory<?> factory : matchingExtractors) {
-                       @SuppressWarnings("rawtypes")
-                       final Extractor extractor = factory.createExtractor();
+                Iterator<ExtractorFactory<?>> factories = matchingExtractors.iterator();
+                while (factories.hasNext()) {
+                       ExtractorFactory<?> factory = factories.next();
+                       final Extractor<?> extractor = factory.createExtractor();
                        final SingleExtractionReport er = runExtractor(
                                extractionParameters,
                                documentLanguage,
                                extractor
                        );
+                       // Fix for ANY23-415:
+                       if (!er.touched && detectedMIMEType != null && isTooGeneric(detectedMIMEType)
+                            && factory.getSupportedMIMETypes().stream().anyMatch(mt -> !isTooGeneric(mt))) {
+                           factories.remove();
+                           continue;
+                    }
                        resourceRoots.addAll( er.resourceRoots );
                        propertyPaths.addAll( er.propertyPaths );
                        extractorToIssues.put(factory.getExtractorName(), er.issues);
@@ -311,6 +320,16 @@ public class SingleDocumentExtraction {
         );
     }
 
+    // Too generic = wildcard subtype, or a full type equal to Tika's PLAIN_TEXT, OCTET_STREAM or XML.
+    private static boolean isTooGeneric(MIMEType type) {
+        if (!type.isAnySubtype()) {
+            final String fullType = type.getFullType();
+            return fullType.equals(MimeTypes.PLAIN_TEXT) || fullType.equals(MimeTypes.OCTET_STREAM)
+                    || fullType.equals(MimeTypes.XML);
+        }
+        return true;
+    }
+
     /**
      * Triggers the execution of all the {@link Extractor}
      * registered to this class using the <i>default</i> extraction parameters.
@@ -490,7 +509,8 @@ public class SingleDocumentExtraction {
                 new SingleExtractionReport(
                     extractionResult.getIssues(),
                     new ArrayList<ResourceRoot>( extractionResult.getResourceRoots() ),
-                    new ArrayList<PropertyPath>( extractionResult.getPropertyPaths() )
+                    new ArrayList<PropertyPath>( extractionResult.getPropertyPaths() ),
+                    extractionResult.wasTouched()
                 );
         } catch (ExtractionException ex) {
             if(log.isDebugEnabled()) {
@@ -866,19 +886,22 @@ public class SingleDocumentExtraction {
     /**
      * Entity detection report.
      */
-    private class SingleExtractionReport {
+    private static class SingleExtractionReport { // static nested class: carries no enclosing-instance reference
         private final Collection<IssueReport.Issue> issues;
         private final List<ResourceRoot>            resourceRoots;
         private final List<PropertyPath>            propertyPaths;
+        private final boolean touched; // value passed to the constructor as wasTouched
 
         public SingleExtractionReport(
                 Collection<IssueReport.Issue>  issues,
                 List<ResourceRoot> resourceRoots,
-                List<PropertyPath> propertyPaths
+                List<PropertyPath> propertyPaths,
+                boolean wasTouched // stored unchanged in the touched field
         ) {
             this.issues        = issues;
             this.resourceRoots = resourceRoots;
             this.propertyPaths = propertyPaths;
+            this.touched = wasTouched;
         }
     }
 
index d1d3467..7666e1b 100644 (file)
@@ -17,6 +17,8 @@
 
 package org.apache.any23;
 
+import org.apache.any23.extractor.ExtractorGroup;
+import org.apache.any23.extractor.rdf.NTriplesExtractorFactory;
 import org.junit.Assert;
 import org.apache.any23.configuration.Configuration;
 import org.apache.any23.configuration.DefaultConfiguration;
@@ -64,6 +66,7 @@ import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.net.URISyntaxException;
 import java.nio.charset.StandardCharsets;
+import java.util.Collections;
 import java.util.List;
 
 import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;
@@ -540,6 +543,20 @@ public class Any23Test extends Any23OnlineTestBase {
                 n3.contains("http://vocab.sindice.net/size"));
     }
 
+    @Test
+    public void testIssue415() throws Exception {
+        // ANY23-415: expects text/plain detection, no N-Triples issues and no matching extractors.
+        final NTriplesExtractorFactory ntriples = new NTriplesExtractorFactory();
+        final Any23 any23 = new Any23(new ExtractorGroup(Collections.singleton(ntriples)));
+        final String humansTxt = IOUtils.resourceToString("/rdf/issue415.txt", StandardCharsets.UTF_8);
+
+        final ExtractionReport result =
+                any23.extract(humansTxt, "http://humanstxt.org/humans.txt", new CompositeTripleHandler());
+        Assert.assertEquals("text/plain", result.getDetectedMimeType());
+        Assert.assertEquals(0, result.getExtractorIssues(ntriples.getExtractorName()).size());
+        Assert.assertEquals(0, result.getMatchingExtractors().size());
+    }
+
     /**
      * Performs detection and extraction on the given input string and return
      * the {@link ExtractionReport}.
diff --git a/test-resources/src/test/resources/rdf/issue415.txt b/test-resources/src/test/resources/rdf/issue415.txt
new file mode 100644 (file)
index 0000000..7f20327
--- /dev/null
@@ -0,0 +1,91 @@
+#
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+/* This page borrowed from http://humanstxt.org/humans.txt to test ANY23-415 */
+
+/* TEAM */
+       Chef:Juanjo Bernabeu
+       Contact: hello [at] humanstxt.org
+       Twitter: @juanjobernabeu
+       From:Barcelona, Catalonia, Spain
+
+       UI developer: Maria Macias
+       Twitter: @maria_ux
+       From:Barcelona, Catalonia, Spain
+
+       One eyed illustrator: Carlos Mañas
+       Twitter: @oneeyedman
+       From:Madrid, Spain
+
+       Standard Man: Abel Cabans
+       Twitter: @abelcabans
+       From:Barcelona, Catalonia, Spain
+
+       Web designer: Abel Sutilo
+       Twitter: @abelsutilo
+       From:Sevilla, Andalucia, Spain
+
+/* THANKS */
+
+       (First) EN Translator: Jos Flores
+       Twitter: @prosciuttos
+       From: Barcelona, Catalonia, Spain
+
+       CA Translator: Eva AC
+       Twitter: @evaac
+       From:Barcelona, Catalonia, Spain
+
+       EN Translator: Marta Armada
+       Twitter: @martuishere
+       From: Barcelona, Catalonia, Spain
+
+       RU Translator: Alexey Bass
+       Twitter: @alexey_bass
+       Location: Israel, Netanya
+
+       RU Translator: Vladimir Epifanov
+       Twitter: @voldmar
+       Location: Moscow, Russia
+
+       NL Translator: Rowdy Rabouw
+       Twitter: @rowdyrabouw
+       Location: Gouda, The Netherlands
+
+       DE Translator: Dennis Fischer
+       Twitter: @ichderfisch
+       Location: Düsseldorf / Germany
+
+       CZ Translator: Daniel Kršiak
+       Twitter: @krsiakdaniel
+       Location: Czech Republic
+
+       ZH Translator: Ana Villalba
+       Location: Spain
+
+       JA Translator: Clémence Haure
+       Location: Spain
+
+       FR Translator: Thibaud Desodt
+       Location: Belgium
+
+       Media Queries by: Marta Armada (@martuishere) and Javier Usobiaga (@htmlboy)
+
+
+/* SITE */
+       Last update:2012/02/04
+       Language: Català / Czech / Deutsch / English / Castellano / Japanese / Dutch / Russian / Chinese
+       Doctype:HTML5
+       IDE: Sublime Text, Notepad++, FileZilla, Photoshop