ANY23-376 fix IllegalArgumentException in microdata extractor
author Hans <firedrake93@gmail.com>
Tue, 31 Jul 2018 20:35:55 +0000 (15:35 -0500)
committer Hans <firedrake93@gmail.com>
Tue, 31 Jul 2018 20:35:55 +0000 (15:35 -0500)
core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
test-resources/src/test/resources/microdata/microdata-bad-properties-expected.nquads [new file with mode: 0644]
test-resources/src/test/resources/microdata/microdata-bad-properties.html [new file with mode: 0644]

index 32faec3..f305620 100644 (file)
@@ -17,6 +17,7 @@
 package org.apache.any23.extractor.microdata;
 
 import org.apache.any23.extractor.html.DomUtils;
+import org.apache.commons.lang.StringUtils;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.NamedNodeMap;
@@ -394,9 +395,15 @@ public class MicrodataParser {
         while (treeWalker.nextNode() != null);
 
         final List<ItemProp> result = new ArrayList<>();
-        for(Node itemPropNode :  accepted) {
+        for (Node itemPropNode : accepted) {
             final String itemProp = DomUtils.readAttribute(itemPropNode, ITEMPROP_ATTRIBUTE, null);
-            final String[] propertyNames = itemProp.split(" ");
+
+            if (StringUtils.isBlank(itemProp)) {
+                manageError(new MicrodataParserException("invalid property name '" + itemProp + "'", itemPropNode));
+                continue;
+            }
+
+            final String[] propertyNames = itemProp.trim().split("\\s+");
             ItemPropValue itemPropValue;
             for (String propertyName : propertyNames) {
                 try {
index 280b3f7..e858ea3 100644 (file)
@@ -19,6 +19,7 @@ package org.apache.any23.extractor.microdata;
 
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.IssueReport;
 import org.apache.any23.extractor.html.AbstractExtractorTestCase;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.SINDICE;
@@ -89,7 +90,6 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase {
         assertExtract("/microdata/microdata-missing-scheme.html");
         assertModelNotEmpty();
         assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Answer"));
-        System.out.println(dumpHumanReadableTriples());
     }
 
     /**
@@ -206,9 +206,20 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase {
         extractAndVerifyAgainstNQuads("microdata-bad-types.html", "microdata-bad-types-expected.nquads");
     }
 
+    @Test
+    public void testBadPropertyNames() throws IOException {
+        extractAndVerifyAgainstNQuads("microdata-bad-properties.html", "microdata-bad-properties-expected.nquads", false);
+        assertIssue(IssueReport.IssueLevel.ERROR, ".*invalid property name ''.*\"path\" : \"/HTML\\[1\\]/BODY\\[1\\]/DIV\\[1\\]/DIV\\[2\\]/DIV\\[1\\]\".*");
+    }
+
     private void extractAndVerifyAgainstNQuads(String actual, String expected)
+            throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
+        extractAndVerifyAgainstNQuads(actual, expected, true);
+    }
+
+    private void extractAndVerifyAgainstNQuads(String actual, String expected, boolean assertNoIssues)
     throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
-        assertExtract("/microdata/" + actual);
+        assertExtract("/microdata/" + actual, assertNoIssues);
         assertModelNotEmpty();
         logger.debug( dumpModelToNQuads() );
         List<Statement> expectedStatements = loadResultStatement("/microdata/" + expected);
diff --git a/test-resources/src/test/resources/microdata/microdata-bad-properties-expected.nquads b/test-resources/src/test/resources/microdata/microdata-bad-properties-expected.nquads
new file mode 100644 (file)
index 0000000..e5b6f29
--- /dev/null
@@ -0,0 +1,84 @@
+#
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+_:node1cjov1p83x2 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Event> <http://bob.example.com/> .
+_:node1cjov1p83x2 <http://schema.org/endDate> "2018-07-29T17:00:00-07:00" <http://bob.example.com/> .
+_:node1cjov1p83x2 <http://schema.org/name> "Midwest Fire Fest" <http://bob.example.com/> .
+_:node1cjov1p83x2 <http://schema.org/description> "Come to the most unique festival in the Midwest" <http://bob.example.com/> .
+_:node1cjov1p83x3 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Place> <http://bob.example.com/> .
+_:node1cjov1p83x3 <http://schema.org/hasMap> "http://maps.google.com/?q=300+Water+St%2C+Cambridge%2C+WI+53523" <http://bob.example.com/> .
+_:node1cjov1p83x3 <http://schema.org/name> "Westside Park" <http://bob.example.com/> .
+_:node1cjov1p83x2 <http://schema.org/location> _:node1cjov1p83x3 <http://bob.example.com/> .
+_:node1cjov1p83x2 <http://schema.org/url> <https://cambridgewi.com/events-calendar/?event_rdate=20180729090000%2C20180729170000> <http://bob.example.com/> .
+_:node1cjov1p83x2 <http://schema.org/startDate> "2018-07-29T09:00:00-07:00" <http://bob.example.com/> .
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node1cjov1p83x2 <http://bob.example.com/> .
+_:node1cjov1p83x4 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Event> <http://bob.example.com/> .
+_:node1cjov1p83x4 <http://schema.org/endDate> "2018-07-31T13:00:00-07:00" <http://bob.example.com/> .
+_:node1cjov1p83x4 <http://schema.org/name> "Cambridge Senior Meals" <http://bob.example.com/> .
+_:node1cjov1p83x4 <http://schema.org/description> "Cambridge Senior Meals are served at Noon every Tuesday and Friday" <http://bob.example.com/> .
+_:node1cjov1p83x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Place> <http://bob.example.com/> .
+_:node1cjov1p83x6 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/PostalAddress> <http://bob.example.com/> .
+_:node1cjov1p83x6 <http://schema.org/streetAddress> "200 Spring Steet" <http://bob.example.com/> .
+_:node1cjov1p83x6 <http://schema.org/postalCode> "53523" <http://bob.example.com/> .
+_:node1cjov1p83x6 <http://schema.org/addressLocality> "Cambridge" <http://bob.example.com/> .
+_:node1cjov1p83x6 <http://schema.org/addressRegion> "WI" <http://bob.example.com/> .
+_:node1cjov1p83x5 <http://schema.org/address> _:node1cjov1p83x6 <http://bob.example.com/> .
+_:node1cjov1p83x5 <http://schema.org/hasMap> "http://maps.google.com/?q=200+Spring+Street%2C+Cambridge%2C+WI+53523" <http://bob.example.com/> .
+_:node1cjov1p83x5 <http://schema.org/name> "Amundson Center" <http://bob.example.com/> .
+_:node1cjov1p83x4 <http://schema.org/location> _:node1cjov1p83x5 <http://bob.example.com/> .
+_:node1cjov1p83x4 <http://schema.org/url> <https://cambridgewi.com/events-calendar/?event_rdate=20180731120000%2C20180731130000> <http://bob.example.com/> .
+_:node1cjov1p83x4 <http://schema.org/startDate> "2018-07-31T12:00:00-07:00" <http://bob.example.com/> .
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node1cjov1p83x4 <http://bob.example.com/> .
+_:node1cjov1p83x7 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Event> <http://bob.example.com/> .
+_:node1cjov1p83x7 <http://schema.org/endDate> "2018-07-31T19:00:00-07:00" <http://bob.example.com/> .
+_:node1cjov1p83x7 <http://schema.org/name> "Begin to Knit Classes" <http://bob.example.com/> .
+_:node1cjov1p83x7 <http://schema.org/description> "Learn to knit at Kaleidoscope Fibers - Cambridge's speciality yarn,..." <http://bob.example.com/> .
+_:node1cjov1p83x8 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Place> <http://bob.example.com/> .
+_:node1cjov1p83x9 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/PostalAddress> <http://bob.example.com/> .
+_:node1cjov1p83x9 <http://schema.org/streetAddress> "Null" <http://bob.example.com/> .
+_:node1cjov1p83x8 <http://schema.org/address> _:node1cjov1p83x9 <http://bob.example.com/> .
+_:node1cjov1p83x8 <http://schema.org/name> "Kaleidoscope Fibers (131 W. Main Street" <http://bob.example.com/> .
+_:node1cjov1p83x7 <http://schema.org/location> _:node1cjov1p83x8 <http://bob.example.com/> .
+_:node1cjov1p83x7 <http://schema.org/url> <https://cambridgewi.com/events-calendar/?event_rdate=20180731170000%2C20180731190000> <http://bob.example.com/> .
+_:node1cjov1p83x7 <http://schema.org/startDate> "2018-07-31T17:00:00-07:00" <http://bob.example.com/> .
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node1cjov1p83x7 <http://bob.example.com/> .
+_:node1cjov1p83x10 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Event> <http://bob.example.com/> .
+_:node1cjov1p83x10 <http://schema.org/endDate> "2018-08-01T15:00:00-07:00" <http://bob.example.com/> .
+_:node1cjov1p83x10 <http://schema.org/name> "Cambridge Historic School Museum Tour" <http://bob.example.com/> .
+_:node1cjov1p83x10 <http://schema.org/description> "Built in 1906, the Cambridge Historic School - listed on the..." <http://bob.example.com/> .
+_:node1cjov1p83x11 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Place> <http://bob.example.com/> .
+_:node1cjov1p83x12 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/PostalAddress> <http://bob.example.com/> .
+_:node1cjov1p83x12 <http://schema.org/streetAddress> "Null" <http://bob.example.com/> .
+_:node1cjov1p83x11 <http://schema.org/address> _:node1cjov1p83x12 <http://bob.example.com/> .
+_:node1cjov1p83x11 <http://schema.org/name> "Cambridge Historic School" <http://bob.example.com/> .
+_:node1cjov1p83x10 <http://schema.org/location> _:node1cjov1p83x11 <http://bob.example.com/> .
+_:node1cjov1p83x10 <http://schema.org/url> <https://cambridgewi.com/events-calendar/?event_rdate=20180801123000%2C20180801150000> <http://bob.example.com/> .
+_:node1cjov1p83x10 <http://schema.org/startDate> "2018-08-01T12:30:00-07:00" <http://bob.example.com/> .
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node1cjov1p83x10 <http://bob.example.com/> .
+_:node1cjov1p83x13 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Event> <http://bob.example.com/> .
+_:node1cjov1p83x13 <http://schema.org/endDate> "2018-08-01T15:00:00-07:00" <http://bob.example.com/> .
+_:node1cjov1p83x13 <http://schema.org/name> "Begin to Knit Classes" <http://bob.example.com/> .
+_:node1cjov1p83x13 <http://schema.org/description> "Learn to knit at Kaleidoscope Fibers - Cambridge's speciality yarn,..." <http://bob.example.com/> .
+_:node1cjov1p83x14 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Place> <http://bob.example.com/> .
+_:node1cjov1p83x15 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/PostalAddress> <http://bob.example.com/> .
+_:node1cjov1p83x15 <http://schema.org/streetAddress> "Null" <http://bob.example.com/> .
+_:node1cjov1p83x14 <http://schema.org/address> _:node1cjov1p83x15 <http://bob.example.com/> .
+_:node1cjov1p83x14 <http://schema.org/name> "Kaleidoscope Fibers (131 W. Main Street" <http://bob.example.com/> .
+_:node1cjov1p83x13 <http://schema.org/location> _:node1cjov1p83x14 <http://bob.example.com/> .
+_:node1cjov1p83x13 <http://schema.org/url> <https://cambridgewi.com/events-calendar/?event_rdate=20180801130000%2C20180801150000> <http://bob.example.com/> .
+_:node1cjov1p83x13 <http://schema.org/startDate> "2018-08-01T13:00:00-07:00" <http://bob.example.com/> .
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node1cjov1p83x13 <http://bob.example.com/> .
diff --git a/test-resources/src/test/resources/microdata/microdata-bad-properties.html b/test-resources/src/test/resources/microdata/microdata-bad-properties.html
new file mode 100644 (file)
index 0000000..23d4e80
--- /dev/null
@@ -0,0 +1,125 @@
+<!DOCTYPE html>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!-- Excerpted from: https://cambridgewi.com/events-calendar/ -->
+<html>
+
+<head></head>
+
+<body>
+<div itemscope="" itemtype="http://schema.org/Event">
+    <div>
+        <div>
+            <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180729090000%2C20180729170000" itemprop="url"><span itemprop="name">Midwest Fire Fest</span></a>
+            <div><span>Jul 29, 2018</span>&nbsp;<span>9:00am</span></div>
+        </div>
+        <div itemprop="description">Come to the most unique festival in the Midwest</div>
+    </div>
+    <meta itemprop=" startDate " content="2018-07-29T09:00:00-07:00">
+    <meta itemprop=" endDate " content="2018-07-29T17:00:00-07:00">
+    <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place">
+        <meta itemprop=" name" content="Westside Park">
+        <meta itemprop="hasMap " content="http://maps.google.com/?q=300+Water+St%2C+Cambridge%2C+WI+53523">
+        <div itemprop="" itemscope="itemscope" itemtype="http://schema.org/PostalAddress">
+            <meta itemprop="streetAddress" content="300 Water Street">
+            <meta itemprop="addressLocality" content="Cambridge">
+            <meta itemprop="addressRegion" content="WI">
+            <meta itemprop="postalCode" content="53523">
+        </div>
+    </div>
+</div>
+
+<div itemscope="" itemtype="http://schema.org/Event">
+    <div>
+        <div>
+            <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180731120000%2C20180731130000" itemprop="url"><span itemprop="name">Cambridge Senior Meals</span></a>
+            <div><span>Jul 31, 2018</span>&nbsp;<span>12:00pm</span></div>
+        </div>
+        <div itemprop="description">Cambridge Senior Meals are served&nbsp;at Noon every Tuesday and Friday</div>
+    </div>
+    <meta itemprop="startDate" content="2018-07-31T12:00:00-07:00">
+    <meta itemprop="endDate" content="2018-07-31T13:00:00-07:00">
+    <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place">
+        <meta itemprop="name" content="Amundson Center">
+        <meta itemprop="hasMap" content="http://maps.google.com/?q=200+Spring+Street%2C+Cambridge%2C+WI+53523">
+        <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress">
+            <meta itemprop="streetAddress" content="200 Spring Steet">
+            <meta itemprop="addressLocality" content="Cambridge">
+            <meta itemprop="addressRegion" content="WI">
+            <meta itemprop="postalCode" content="53523">
+        </div>
+    </div>
+</div>
+
+<div itemscope="" itemtype="http://schema.org/Event">
+    <div>
+        <div>
+            <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180731170000%2C20180731190000" itemprop="url"><span itemprop="name">Begin to Knit Classes</span></a>
+            <div><span>Jul 31, 2018</span>&nbsp;<span>5:00pm</span></div>
+
+        </div>
+        <div itemprop="description">Learn to knit at Kaleidoscope Fibers - Cambridge's speciality yarn,...</div>
+    </div>
+    <meta itemprop="startDate" content="2018-07-31T17:00:00-07:00">
+    <meta itemprop="endDate" content="2018-07-31T19:00:00-07:00">
+    <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place">
+        <meta itemprop="name" content="Kaleidoscope Fibers (131 W. Main Street">
+        <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress">
+            <meta itemprop="streetAddress" content="">
+        </div>
+    </div>
+</div>
+
+<div itemscope="" itemtype="http://schema.org/Event">
+    <div>
+        <div>
+            <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180801123000%2C20180801150000" itemprop="url"><span itemprop="name">Cambridge Historic School Museum Tour</span></a>
+            <div><span>Aug 1, 2018</span>&nbsp;<span>12:30pm</span></div>
+        </div>
+        <div itemprop="description">Built in 1906, the Cambridge Historic School -&nbsp;listed on the...</div>
+    </div>
+    <div class="rhc-clear"></div>
+    <meta itemprop="startDate" content="2018-08-01T12:30:00-07:00">
+    <meta itemprop="endDate" content="2018-08-01T15:00:00-07:00">
+    <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place">
+        <meta itemprop="name" content="Cambridge Historic School">
+        <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress">
+            <meta itemprop="streetAddress" content="">
+        </div>
+    </div>
+</div>
+
+<div itemscope="" itemtype="http://schema.org/Event">
+    <div>
+        <div>
+            <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180801130000%2C20180801150000" itemprop="url"><span itemprop="name">Begin to Knit Classes</span></a>
+            <div><span>Aug 1, 2018</span>&nbsp;<span>1:00pm</span></div>
+        </div>
+        <div itemprop="description">Learn to knit at Kaleidoscope Fibers - Cambridge's speciality yarn,...</div>
+    </div>
+    <meta itemprop="startDate" content="2018-08-01T13:00:00-07:00">
+    <meta itemprop="endDate" content="2018-08-01T15:00:00-07:00">
+    <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place">
+        <meta itemprop="name" content="Kaleidoscope Fibers (131 W. Main Street">
+        <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress">
+            <meta itemprop="streetAddress" content="">
+        </div>
+    </div>
+</div>
+
+</body>
+</html>
\ No newline at end of file