ANY23-169 Fixed url resolving errors in MicrodataExtractor
author Hans <firedrake93@gmail.com>
Fri, 13 Apr 2018 08:33:22 +0000 (03:33 -0500)
committer Hans <firedrake93@gmail.com>
Fri, 13 Apr 2018 08:33:22 +0000 (03:33 -0500)
core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java
core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
pom.xml
test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads [new file with mode: 0644]
test-resources/src/test/resources/microdata/microdata-nested-url-resolving.html [new file with mode: 0644]
test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads

index 513ffbb..3663800 100644 (file)
@@ -28,6 +28,7 @@ import org.apache.any23.extractor.html.DomUtils;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.DCTerms;
 import org.apache.any23.vocab.XHTML;
+import org.eclipse.rdf4j.common.net.ParsedIRI;
 import org.eclipse.rdf4j.model.Literal;
 import org.eclipse.rdf4j.model.Resource;
 import org.eclipse.rdf4j.model.IRI;
@@ -39,8 +40,6 @@ import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 
 import java.io.IOException;
-import java.net.MalformedURLException;
-import java.net.URI;
 import java.net.URISyntaxException;
 import java.net.URL;
 import java.util.Date;
@@ -48,7 +47,7 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
-import java.util.Objects;
+import java.util.Optional;
 import java.util.Set;
 
 /**
@@ -241,26 +240,14 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
         if (href == null) {
             return;
         }
-        URL absoluteURL;
-        if (!isAbsoluteURL(href.getTextContent())) {
-            try {
-                absoluteURL = toAbsoluteURL(
-                        documentIRI.toString(),
-                        href.getTextContent(),
-                        '/'
-                );
-            } catch (MalformedURLException e) {
-                // okay, it's not an absolute URL, return
-                return;
-            }
-        } else {
-            try {
-                absoluteURL = new URL(href.getTextContent());
-            } catch (MalformedURLException e) {
-                // cannot happen
-                return;
-            }
+        IRI iri;
+        try {
+            iri = toAbsoluteIRI(documentIRI, href.getTextContent());
+        } catch (URISyntaxException e) {
+            // href (or the document IRI) could not be parsed as an IRI; skip this link
+            return;
         }
+
         String[] relTokens = rel.getTextContent().split(" ");
         Set<String> tokensWithNoDuplicates = new HashSet<>();
         for (String relToken : relTokens) {
@@ -275,16 +262,11 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
             tokensWithNoDuplicates.add(relToken.toLowerCase());
         }
         for (String token : tokensWithNoDuplicates) {
-            IRI predicate;
-            if (isAbsoluteURL(token)) {
-                predicate = RDFUtils.iri(token);
-            } else {
-                predicate = RDFUtils.iri(XHTML.NS + token);
-            }
+            IRI predicate = toAbsoluteIRI(token).orElseGet(() -> RDFUtils.iri(XHTML.NS + token.trim()));
             out.writeTriple(
                     documentIRI,
                     predicate,
-                    RDFUtils.iri(absoluteURL.toString())
+                    iri
             );
         }
     }
@@ -304,9 +286,10 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
             String name    = DomUtils.readAttribute(meta, "name", null);
             String content = DomUtils.readAttribute(meta, "content", null);
             if (name != null && content != null) {
-                if (isAbsoluteURL(name)) {
+                Optional<IRI> nameIRI = toAbsoluteIRI(name);
+                if (nameIRI.isPresent()) {
                     processMetaElement(
-                            RDFUtils.iri(name),
+                            nameIRI.get(),
                             content,
                             getLanguage(meta),
                             documentIRI,
@@ -385,7 +368,7 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
         }
         out.writeTriple(
                 documentIRI,
-                RDFUtils.iri(XHTML.NS + name.toLowerCase()),
+                RDFUtils.iri(XHTML.NS + name.toLowerCase().trim()),
                 subject
         );
     }
@@ -455,7 +438,7 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
                             mappings,
                             out
                     );
-                } catch (MalformedURLException e) {
+                } catch (URISyntaxException e) {
                     throw new ExtractionException(
                             "Error while processing on subject '" + subject +
                                     "' the itemProp: '" + itemProp + "' "
@@ -472,17 +455,8 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
     }
 
     private static Resource createSubjectForItemId(String itemId) {
-        if (itemId != null) {
-            try {
-                URI uri = new URI(itemId.trim());
-                if (uri.isAbsolute()) {
-                    return RDFUtils.iri(uri.toString());
-                }
-            } catch (URISyntaxException e) {
-                //not an absolute uri
-            }
-        }
-        return RDFUtils.bnode();
+        Optional<IRI> iri = toAbsoluteIRI(itemId);
+        return iri.isPresent() ? iri.get() : RDFUtils.bnode();
     }
 
     private void processProperty(
@@ -493,7 +467,7 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
             IRI documentIRI,
             Map<ItemScope, Resource> mappings,
             ExtractionResult out
-    ) throws MalformedURLException, ExtractionException {
+    ) throws URISyntaxException, ExtractionException {
 
         IRI predicate = getPredicate(itemScopeType != null ? itemScopeType : defaultNamespace, propName);
         if (predicate == null) {
@@ -508,10 +482,7 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
         } else if (propType.equals(ItemPropValue.Type.Plain)) {
             value = RDFUtils.literal((String) propValue, documentLanguage);
         } else if (propType.equals(ItemPropValue.Type.Link)) {
-            value = RDFUtils.iri(toAbsoluteURL(
-                    documentIRI.toString(),
-                    (String) propValue,
-                    '/').toString());
+            value = toAbsoluteIRI(documentIRI, (String)propValue);
         } else if (propType.equals(ItemPropValue.Type.Date)) {
             value = RDFUtils.literal(ItemPropValue.formatDateTime((Date) propValue), XMLSchema.DATE);
         } else {
@@ -522,37 +493,37 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
     }
 
     private static IRI getPredicate(IRI itemType, String localName) {
-        if (isAbsoluteURL(localName)) {
-            return RDFUtils.iri(localName);
-        } else if (itemType != null) {
-            return RDFUtils.iri(itemType.getNamespace(), Objects.requireNonNull(localName));
-        } else {
-            return null;
+        return toAbsoluteIRI(localName).orElseGet(() -> itemType == null ? null :
+                RDFUtils.iri(itemType.getNamespace(), localName.trim()));
+    }
+
+    private static Optional<IRI> toAbsoluteIRI(String urlString) {
+        if (urlString != null) {
+            try {
+                ParsedIRI iri = ParsedIRI.create(urlString.trim());
+                if (iri.isAbsolute()) {
+                    return Optional.of(RDFUtils.iri(iri.toString()));
+                }
+            } catch (RuntimeException e) {
+                // not parseable as an IRI, so it cannot be treated as an absolute IRI
+            }
         }
+        return Optional.empty();
     }
 
-    private static boolean isAbsoluteURL(String urlString) {
-        boolean result = false;
+    private static IRI toAbsoluteIRI(IRI documentIRI, String part) throws URISyntaxException {
+        ParsedIRI iri;
         try {
-            URL url = new URL(urlString);
-            String protocol = url.getProtocol();
-            if (protocol != null && protocol.trim().length() > 0)
-                result = true;
-        } catch (MalformedURLException e) {
-            return false;
+            iri = ParsedIRI.create(part.trim());
+        } catch (RuntimeException e) {
+            throw new URISyntaxException(String.valueOf(part), e.getClass().getName() + ": " + e.getMessage());
         }
-        return result;
-    }
 
-    private URL toAbsoluteURL(String ns, String part, char trailing)
-            throws MalformedURLException {
-        if (isAbsoluteURL(part)) {
-            return new URL(part);
+        if (iri.isAbsolute()) {
+            return RDFUtils.iri(iri.toString());
         }
-        char lastChar = ns.charAt(ns.length() - 1);
-        if (lastChar == '#' || lastChar == '/')
-            return new URL(ns + part);
-        return new URL(ns + trailing + part);
+
+        return RDFUtils.iri(new ParsedIRI(documentIRI.toString()).resolve(iri).toString());
     }
 
     private void notifyError(MicrodataParserException[] errors, ExtractionResult out) {
index 5354924..f04d59f 100644 (file)
@@ -624,7 +624,7 @@ public abstract class AbstractExtractorTestCase extends AbstractAny23TestBase {
    * @param p
    *            predicate
    * @return matching object.
-   * @throws org.openrdf.repository.RepositoryException
+   * @throws org.eclipse.rdf4j.repository.RepositoryException
    */
   protected Value findObject(Resource s, IRI p) throws RepositoryException {
     RepositoryResult<Statement> statements = conn.getStatements(s, p, null,
index 8161b36..1294c93 100644 (file)
@@ -22,6 +22,7 @@ import org.apache.any23.extractor.ExtractorFactory;
 import org.apache.any23.extractor.html.AbstractExtractorTestCase;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.SINDICE;
+import org.eclipse.rdf4j.model.IRI;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.junit.Assert;
@@ -172,6 +173,19 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase {
         logger.debug(dumpHumanReadableTriples());
     }
 
+    @Test
+    public void testMicrodataNestedUrlResolving() throws IOException {
+        IRI oldBaseIRI = baseIRI;
+        try {
+            logger.info("\n");
+            baseIRI = RDFUtils.iri("https://ruben.verborgh.org/tmp/schemaorg-test.html");
+            extractAndVerifyAgainstNQuads("microdata-nested-url-resolving.html",
+                    "microdata-nested-url-resolving-expected.nquads");
+        } finally {
+            baseIRI = oldBaseIRI;
+        }
+    }
+
     private void extractAndVerifyAgainstNQuads(String actual, String expected)
     throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
         assertExtract("/microdata/" + actual);
diff --git a/pom.xml b/pom.xml
index 516ed64..392a3cd 100644 (file)
--- a/pom.xml
+++ b/pom.xml
     <httpcore.version>4.4.6</httpcore.version>
     <owlapi.version>5.1.3</owlapi.version>
     <poi.version>3.16</poi.version>
-    <rdf4j.version>2.3.0</rdf4j.version>
+    <rdf4j.version>2.3.1</rdf4j.version>
     <semargl.version>0.7</semargl.version>
     <slf4j.logger.version>1.7.25</slf4j.logger.version>
     <tika.version>1.17</tika.version>
diff --git a/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads b/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads
new file mode 100644 (file)
index 0000000..0eb4bcf
--- /dev/null
@@ -0,0 +1,30 @@
+#
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+_:node1causocqkx2 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/BlogPosting> <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
+_:node1causocqkx2 <http://schema.org/alternativeHeadline> "Solution-based problem-solving restricts the result before the start."@en <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
+_:node1causocqkx2 <http://schema.org/datePublished> "2013-07-30"^^<http://www.w3.org/2001/XMLSchema#date> <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
+_:node1causocqkx3 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
+_:node1causocqkx3 <http://schema.org/givenName> "Ruben"@en <https://ruben.verborgh.org/tmp/schemaorg-test.html>.
+_:node1causocqkx3 <http://schema.org/familyName> "Verborgh"@en <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
+_:node1causocqkx3 <http://schema.org/name> <https://ruben.verborgh.org/> <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
+_:node1causocqkx3 <http://schema.org/url> <https://ruben.verborgh.org/> <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
+_:node1causocqkx2 <http://schema.org/author> _:node1causocqkx3 <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
+_:node1causocqkx2 <http://schema.org/name> "One hammer for a thousand nails"@en <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
+_:node1causocqkx2 <http://schema.org/headline> "One hammer for a thousand nails"@en <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
+<https://ruben.verborgh.org/tmp/schemaorg-test.html> <http://www.w3.org/1999/xhtml/microdata#item> _:node1causocqkx2 <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
+<https://ruben.verborgh.org/tmp/schemaorg-test.html> <http://purl.org/dc/terms/title> "One hammer for a thousand nails | Ruben Verborgh"@en <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
diff --git a/test-resources/src/test/resources/microdata/microdata-nested-url-resolving.html b/test-resources/src/test/resources/microdata/microdata-nested-url-resolving.html
new file mode 100644 (file)
index 0000000..ec3e677
--- /dev/null
@@ -0,0 +1,35 @@
+<!DOCTYPE HTML>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!-- Original page source: https://ruben.verborgh.org/tmp/schemaorg-test.html -->
+<html lang="en" prefix="rv: http://ruben.verborgh.org/# og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# article: http://ogp.me/ns/article#">
+<head>
+    <meta charset="utf-8">
+    <title>One hammer for a thousand nails | Ruben Verborgh</title>
+    <meta property="dc:title" content="One hammer for a thousand nails"/>
+</head>
+<body>
+<article class="blog" itemscope itemtype=http://schema.org/BlogPosting>
+    <h1 itemprop="name headline">One hammer for a thousand nails</h1>
+    <h2 itemprop="alternativeHeadline">Solution-based problem-solving restricts the result before the start.</h2>
+    <p class="signature">
+        <span class="author" itemprop="author" itemscope itemtype=http://schema.org/Person><a itemprop="name url" href="/"><span itemprop="givenName">Ruben</span><span class="spacing"> </span><span itemprop="familyName">Verborgh</span></a></span><br>
+        <time itemprop="datePublished" datetime="2013-07-30T20:30:00+02:00">30 July 2013</time>
+    </p>
+</article>
+</body>
+</html>
index 504b6c8..8c6e70d 100644 (file)
@@ -19,13 +19,13 @@ _:node8b30931f1dde708283dc52546c5572a6 <http://www.w3.org/1999/02/22-rdf-syntax-
 _:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/price> "$55,000.00" <http://bob.example.com/> .
 _:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/description> "2010 Dodge Challenger SRT8 Limited EditionBright Silver Metallic with Dark Slate Gray Leather Interior6.1 Liter (370 CI) V8 SRT HEMI Engine6 Speed Manual Transmission with 3:92 Rear Axle Ratio (DEC, Track Pak)" <http://bob.example.com/> .
 _:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/name> "2010 Dodge Challenger SRT8" <http://bob.example.com/> .
-_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/image> <http://bob.example.com//microdata/images/2010-dodge-challenger-srt8.jpg> <http://bob.example.com/> .
+_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/image> <http://bob.example.com/microdata/images/2010-dodge-challenger-srt8.jpg> <http://bob.example.com/> .
 _:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/url> <http://vheminc.com/> <http://bob.example.com/> .
 <http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node8b30931f1dde708283dc52546c5572a6 <http://bob.example.com/> .
 <http://bob.example.com/> <http://purl.org/dc/terms/title> "HTML5 Microdata Example - http://schema.org/Product" <http://bob.example.com/> .
-<http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#icon> <http://bob.example.com//images/favicon1.ico> <http://bob.example.com/> .
-<http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#shortcut> <http://bob.example.com//images/favicon1.ico> <http://bob.example.com/> .
-<http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#icon> <http://bob.example.com//images/favicon1.gif> <http://bob.example.com/> .
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#icon> <http://bob.example.com/images/favicon1.ico> <http://bob.example.com/> .
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#shortcut> <http://bob.example.com/images/favicon1.ico> <http://bob.example.com/> .
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#icon> <http://bob.example.com/images/favicon1.gif> <http://bob.example.com/> .
 <http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#robots> "noarchive" <http://bob.example.com/> .
 <http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#description> "HTML5 Microdata Example for http://schema.org/Product" <http://bob.example.com/> .
 <http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#author> "Edward Lewis" <http://bob.example.com/> .
\ No newline at end of file