ANY23-374 fix schemeless microdata urls
author Hans <firedrake93@gmail.com>
Tue, 31 Jul 2018 17:21:26 +0000 (12:21 -0500)
committer Hans <firedrake93@gmail.com>
Tue, 31 Jul 2018 17:24:42 +0000 (12:24 -0500)
core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
test-resources/src/test/resources/microdata/microdata-missing-scheme.html [new file with mode: 0644]

index 0ab0fee..2f079bb 100644 (file)
@@ -28,6 +28,7 @@ import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.regex.Pattern;
 
 /**
  * This class describes a <b>Microdata <i>itemscope</i></b>.
@@ -75,12 +76,27 @@ public class ItemScope extends Item {
         this(xpath, itemProps, id, refs, stringToUrl(type), itemId);
     }
 
+    private static final Pattern looksLikeStartsWithHost = Pattern.compile("[^:/.]+(\\.[^:/.]+)+(:\\d+)?([/#?].*)?"); // full-match test for a scheme-less string that begins with a dotted host, e.g. "schema.org/Answer": two or more dot-separated labels (no ':', '/' or '.'), an optional ":port", then an optional remainder starting with '/', '#' or '?'
+
     static URL stringToUrl(String type) {
         if (StringUtils.isNotBlank(type)) {
             try {
-                return new URL(ParsedIRI.create(type.trim()).toString());
+                ParsedIRI iri = ParsedIRI.create(type.trim());
+                if (StringUtils.isBlank(iri.getScheme())) {
+                    String host = iri.getHost();
+                    if (StringUtils.isNotBlank(host)) {
+                        iri = new ParsedIRI("http", iri.getUserInfo(), host, iri.getPort(), iri.getPath(), iri.getQuery(), iri.getFragment());
+                    } else {
+                        String path = iri.getPath();
+                        if (path != null && looksLikeStartsWithHost.matcher(path).matches()) {
+                            iri = ParsedIRI.create("http://" + iri.toString());
+                        }
+                    }
+                }
+
+                return new URL(iri.toString());
             } catch (MalformedURLException murle) {
-                throw new IllegalArgumentException("Invalid type '" + type + "', must be a valid URL.");
+                throw new IllegalArgumentException("Invalid type '" + type + "', must be a valid URL. " + murle.getMessage());
             }
         } else {
             return null;
index f2e7852..280b3f7 100644 (file)
@@ -23,6 +23,7 @@ import org.apache.any23.extractor.html.AbstractExtractorTestCase;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.SINDICE;
 import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.vocabulary.RDF;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.junit.Assert;
@@ -83,6 +84,14 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase {
         assertStatementsSize(RDFUtils.iri("urn:isbn:0-330-34032-8"), null, null, 4);
     }
 
+    @Test
+    public void testMicrodataMissingScheme() { // ANY23-374: an itemtype written without a URL scheme must still be extracted
+        assertExtract("/microdata/microdata-missing-scheme.html");
+        assertModelNotEmpty();
+        assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Answer")); // the fixture declares itemtype="schema.org/Answer"; the expected type IRI carries an http:// scheme
+        System.out.println(dumpHumanReadableTriples()); // NOTE(review): writes the triple dump to stdout on every run; looks like a debugging aid - confirm whether it should stay
+    }
+
     /**
      * Reference test as provided by <a href="http://googlewebmastercentral.blogspot.com/2010/03/microdata-support-for-rich-snippets.html">Google Rich Snippet for Microdata.</a>
      *
diff --git a/test-resources/src/test/resources/microdata/microdata-missing-scheme.html b/test-resources/src/test/resources/microdata/microdata-missing-scheme.html
new file mode 100644 (file)
index 0000000..af8277f
--- /dev/null
@@ -0,0 +1,33 @@
+<!DOCTYPE html>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Missing Scheme</title>
+</head>
+<body>
+
+<div itemscope itemtype="http://schema.org/Question">
+    <h3 itemprop="name">Name</h3>
+    <div itemprop="acceptedAnswer" itemscope itemtype="schema.org/Answer">
+        <p itemprop="text">Text</p>
+    </div>
+</div>
+
+</body>
+</html>
\ No newline at end of file