ANY23-415 narrow generic mimetypes on successful triple extraction 136/head
author Hans <firedrake93@gmail.com>
Sat, 9 Feb 2019 21:27:28 +0000 (15:27 -0600)
committer Hans <firedrake93@gmail.com>
Sat, 9 Feb 2019 21:27:28 +0000 (15:27 -0600)
core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
core/src/test/java/org/apache/any23/Any23Test.java
test-resources/src/test/resources/rdf/issue415-valid.txt [new file with mode: 0644]

index ea5608c..66022fe 100644 (file)
@@ -60,10 +60,10 @@ import java.util.Collection;
 import java.util.Collections;
 import java.util.Date;
 import java.util.HashMap;
-import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.UUID;
+import java.util.stream.Collectors;
 
 import static org.apache.any23.extractor.TagSoupExtractionResult.PropertyPath;
 import static org.apache.any23.extractor.TagSoupExtractionResult.ResourceRoot;
@@ -255,9 +255,10 @@ public class SingleDocumentExtraction {
             final String documentLanguage;
                try {
                    documentLanguage = extractDocumentLanguage(extractionParameters);
-                Iterator<ExtractorFactory<?>> factories = matchingExtractors.iterator();
-                while (factories.hasNext()) {
-                       ExtractorFactory<?> factory = factories.next();
+                   ArrayList<ExtractorFactory<?>> filteredList = new ArrayList<>(matchingExtractors.getNumOfExtractors());
+                final boolean mimeTypeIsTooGeneric = isTooGeneric(detectedMIMEType);
+                ArrayList<String> intersectionOfRdfMimetypes = null;
+                for (ExtractorFactory<?> factory : matchingExtractors) {
                        final Extractor<?> extractor = factory.createExtractor();
                        final SingleExtractionReport er = runExtractor(
                                extractionParameters,
@@ -265,15 +266,42 @@ public class SingleDocumentExtraction {
                                extractor
                        );
                        // Fix for ANY23-415:
-                       if (!er.touched && detectedMIMEType != null && isTooGeneric(detectedMIMEType)
-                            && factory.getSupportedMIMETypes().stream().anyMatch(mt -> !isTooGeneric(mt))) {
-                           factories.remove();
-                           continue;
+                    if (mimeTypeIsTooGeneric) {
+                        List<String> rdfMimetypes = factory.getSupportedMIMETypes().stream()
+                                .filter(mt -> !isTooGeneric(mt))
+                                .map(MIMEType::getFullType)
+                                .collect(Collectors.toList());
+                        if (er.touched) {
+                            // If detected mimetype is too generic, but we find extractors matching
+                            // this mimetype that are capable of producing RDF triples from this resource,
+                            // and these extractors are also associated with more specific RDF mimetypes,
+                            // then we can simply take the intersection of these more specific mimetypes
+                            // to narrow down the generic, non-RDF mimetype to a specific RDF mimetype.
+                            if (intersectionOfRdfMimetypes == null) {
+                                intersectionOfRdfMimetypes = new ArrayList<>(rdfMimetypes);
+                            } else {
+                                intersectionOfRdfMimetypes.retainAll(rdfMimetypes);
+                            }
+                        } else if (!rdfMimetypes.isEmpty()) {
+                            // If detected mimetype is too generic, and this extractor matches both the
+                            // generic mimetype and a more specific mimetype, but did not produce any RDF
+                            // triples, then we can safely assume that this extractor is not actually a
+                            // match for the type of file we are parsing (e.g., a "humans.txt" file).
+                            continue;
+                        }
                     }
                        resourceRoots.addAll( er.resourceRoots );
                        propertyPaths.addAll( er.propertyPaths );
+                       filteredList.add(factory);
                        extractorToIssues.put(factory.getExtractorName(), er.issues);
                    }
+                matchingExtractors = new ExtractorGroup(filteredList);
+                if (intersectionOfRdfMimetypes != null && !intersectionOfRdfMimetypes.isEmpty()) {
+                    // If the detected mimetype is a generic, non-RDF mimetype, and the intersection
+                    // of specific RDF mimetypes across all triple-producing extractors is non-empty,
+                    // simply replace the generic mimetype with a specific RDF mimetype in that intersection.
+                    detectedMIMEType = MIMEType.parse(intersectionOfRdfMimetypes.get(0));
+                }
                } catch(ValidatorException ve) {
                    throw new ExtractionException("An error occurred during the validation phase.", ve);
                }
@@ -321,7 +349,7 @@ public class SingleDocumentExtraction {
     }
 
     private static boolean isTooGeneric(MIMEType type) {
-        if (type.isAnySubtype()) {
+        if (type == null || type.isAnySubtype()) {
             return true;
         }
         String mt = type.getFullType();
index 7666e1b..d3c73ea 100644 (file)
@@ -544,7 +544,7 @@ public class Any23Test extends Any23OnlineTestBase {
     }
 
     @Test
-    public void testIssue415() throws Exception {
+    public void testIssue415InvalidNTriples() throws Exception {
         NTriplesExtractorFactory factory = new NTriplesExtractorFactory();
         Any23 runner = new Any23(new ExtractorGroup(Collections.singleton(factory)));
 
@@ -557,6 +557,22 @@ public class Any23Test extends Any23OnlineTestBase {
         Assert.assertEquals(0, report.getMatchingExtractors().size());
     }
 
+    @Test
+    public void testIssue415ValidNTriples() throws Exception { // ANY23-415: valid N-Triples content behind a ".txt" URL
+        NTriplesExtractorFactory factory = new NTriplesExtractorFactory();
+        Any23 runner = new Any23(new ExtractorGroup(Collections.singleton(factory))); // group holds only the N-Triples factory
+
+        CountingTripleHandler handler = new CountingTripleHandler();
+        ExtractionReport report = runner.extract(
+                IOUtils.resourceToString("/rdf/issue415-valid.txt", StandardCharsets.UTF_8), // fixture: license header + one triple
+                "http://humanstxt.org/humans.txt", // document URL handed to the runner
+                handler);
+        Assert.assertEquals("application/n-triples", report.getDetectedMimeType()); // report must carry the specific RDF type
+        Assert.assertEquals(0, report.getExtractorIssues(factory.getExtractorName()).size()); // no issues recorded for this extractor
+        Assert.assertEquals(1, report.getMatchingExtractors().size()); // the extractor stays in the matching set
+        Assert.assertEquals(1, handler.getCount()); // handler count for the one-triple fixture
+    }
+
     /**
      * Performs detection and extraction on the given input string and return
      * the {@link ExtractionReport}.
diff --git a/test-resources/src/test/resources/rdf/issue415-valid.txt b/test-resources/src/test/resources/rdf/issue415-valid.txt
new file mode 100644 (file)
index 0000000..00c3ec4
--- /dev/null
@@ -0,0 +1,18 @@
+#
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+<http://example.com/subject> <http://example.com/predicate> <http://example.com/object> .