Require strict isomorphism with the online microdata tests' expected models
author Hans <firedrake93@gmail.com>
Tue, 30 Oct 2018 03:28:45 +0000 (22:28 -0500)
committer Hans <firedrake93@gmail.com>
Tue, 30 Oct 2018 03:28:45 +0000 (22:28 -0500)
core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
test-resources/src/test/resources/microdata/5.2.1-non-normative-example-2-expected.nquads

index cac6689..50f880f 100644 (file)
@@ -49,7 +49,7 @@ import java.util.Optional;
  */
 public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
 
-    private static final IRI MICRODATA_ITEM
+    static final IRI MICRODATA_ITEM
             = RDFUtils.iri("http://www.w3.org/1999/xhtml/microdata#item");
 
     private static final ParsedIRI EMPTY_FRAG = ParsedIRI.create("#");
index c086d3f..8c3c641 100644 (file)
@@ -300,32 +300,6 @@ public class MicrodataParser {
         ps.append("}");
     }
 
-    /**
-     * Returns only nodes that are <b>not</b> nested one each other.
-     *
-     * @param candidates list of candidate nodes.
-     * @return list of unnested nodes.
-     */
-    @SuppressWarnings("unused")
-    private static List<Node> getUnnestedNodes(List<Node> candidates) {
-        final List<Node> unnesteds  = new ArrayList<>();
-        for(int i = 0; i < candidates.size(); i++) {
-            boolean skip = false;
-            for(int j = 0; j < candidates.size(); j++) {
-                if(i == j)
-                    continue;
-                if( DomUtils.isAncestorOf(candidates.get(j), candidates.get(i), true) ) {
-                    skip = true;
-                    break;
-                }
-            }
-            if(!skip) {
-                unnesteds.add( candidates.get(i) );
-            }
-        }
-        return unnesteds;
-    }
-
     public void setErrorMode(ErrorMode errorMode) {
         if(errorMode == null)
             throw new IllegalArgumentException("errorMode must be not null.");
@@ -527,35 +501,42 @@ public class MicrodataParser {
     public List<ItemProp> getItemProps(final Node scopeNode, boolean skipRoot) throws MicrodataParserException {
         final Set<Node> accepted = new LinkedHashSet<>();
 
+        boolean skipRootChildren = false;
         if (!skipRoot) {
             NamedNodeMap attributes = scopeNode.getAttributes();
             if (attributes.getNamedItem(ITEMPROP_ATTRIBUTE) != null) {
                 accepted.add(scopeNode);
             }
+            if (attributes.getNamedItem(ITEMSCOPE_ATTRIBUTE) != null) {
+                skipRootChildren = true;
+            }
         }
 
-        // TreeWalker to walk DOM tree starting with the scopeNode. Nodes maybe visited multiple times.
-        TreeWalker treeWalker = ((DocumentTraversal) scopeNode.getOwnerDocument())
-            .createTreeWalker(scopeNode, NodeFilter.SHOW_ELEMENT, new NodeFilter() {
-            @Override
-            public short acceptNode(Node node) {
-                if (node.getNodeType() == Node.ELEMENT_NODE) {
-                    NamedNodeMap attributes = node.getAttributes();
-                    if (attributes.getNamedItem(ITEMPROP_ATTRIBUTE) != null && !scopeNode.equals(node)) {
-                        accepted.add(node);
-                    }
+        if (!skipRootChildren) {
+            // TreeWalker to walk the DOM tree starting with the scopeNode. Nodes may be visited multiple times.
+            TreeWalker treeWalker = ((DocumentTraversal) scopeNode.getOwnerDocument())
+                    .createTreeWalker(scopeNode, NodeFilter.SHOW_ELEMENT, new NodeFilter() {
+                        @Override
+                        public short acceptNode(Node node) {
+                            if (node.getNodeType() == Node.ELEMENT_NODE) {
+                                NamedNodeMap attributes = node.getAttributes();
+                                if (attributes.getNamedItem(ITEMPROP_ATTRIBUTE) != null && !scopeNode.equals(node)) {
+                                    accepted.add(node);
+                                }
+
+                                if (attributes.getNamedItem(ITEMSCOPE_ATTRIBUTE) != null) {
+                                    // Don't visit descendants of nodes that define a new scope
+                                    return FILTER_REJECT;
+                                }
+                            }
+                            return FILTER_ACCEPT;
+                        }
+                    }, false);
 
-                    if (attributes.getNamedItem(ITEMSCOPE_ATTRIBUTE) != null) {
-                        // Don't visit descendants of nodes that define a new scope
-                        return FILTER_REJECT;
-                    }
-                }
-                return FILTER_ACCEPT;
-            }
-        }, false);
 
-        // To populate accepted we only need to walk the tree.
-        while (treeWalker.nextNode() != null);
+            // To populate accepted we only need to walk the tree.
+            while (treeWalker.nextNode() != null) ;
+        }
 
         final List<ItemProp> result = new ArrayList<>();
         for (Node itemPropNode : accepted) {
index 11aa353..0e634de 100644 (file)
@@ -197,6 +197,7 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase {
                 TreeModel actual = new TreeModel();
                 createRunner(MicrodataExtractorFactory.NAME).extract(action.stringValue(), new TripleWriterHandler() {
                     public void writeTriple(Resource s, IRI p, Value o, Resource g) {
+                        if (MicrodataExtractor.MICRODATA_ITEM.equals(p)) return;
                         actual.add(s, p, o);
                     }
                     public void writeNamespace(String prefix, String uri) { }
@@ -214,8 +215,7 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase {
                     });
                 }
 
-                boolean testPassed = positive ? (expected.isEmpty() ? actual.isEmpty()
-                        : Models.isSubset(expected, actual)) : !Models.isomorphic(expected, actual);
+                boolean testPassed = positive == Models.isomorphic(expected, actual);
                 if (testPassed) {
                     passedTests.incrementAndGet();
                 } else {
index b3e99b0..8eedf33 100644 (file)
@@ -16,7 +16,6 @@
 #
 
 _:nodebdb2c525cf8095abb6954b51432e6 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://microformats.org/profile/hcard> <http://bob.example.com/> .
-_:nodebdb2c525cf8095abb6954b51432e6 <http://microformats.org/profile/hcard#street-address> "Avenue Q" <http://bob.example.com/> .
 _:nodebdb2c525cf8095abb6954b51432e6 <http://microformats.org/profile/hcard#fn> "Princeton" <http://bob.example.com/> .
 _:nodebdb2c525cf8095abb6954b51432e6 <http://microformats.org/profile/hcard#given-name> "Princeton" <http://bob.example.com/> .
 _:nodebdb2c525cf8095abb6954b51432e6 <http://microformats.org/profile/hcard#n> _:node5194c3bb9d7f53e4759c6f393d95f88 <http://bob.example.com/> .
@@ -24,7 +23,6 @@ _:node1ffeb2699b75ba7aca5ee3d72adb55a8 <http://microformats.org/profile/hcard#st
 _:nodebdb2c525cf8095abb6954b51432e6 <http://microformats.org/profile/hcard#adr> _:node1ffeb2699b75ba7aca5ee3d72adb55a8 <http://bob.example.com/> .
 <http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:nodebdb2c525cf8095abb6954b51432e6 <http://bob.example.com/> .
 _:node7a12e48e321d29211c8b7c2ce396854 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://microformats.org/profile/hcard> <http://bob.example.com/> .
-_:node7a12e48e321d29211c8b7c2ce396854 <http://microformats.org/profile/hcard#street-address> "Avenue Q" <http://bob.example.com/> .
 _:node7a12e48e321d29211c8b7c2ce396854 <http://microformats.org/profile/hcard#fn> "Trekkie" <http://bob.example.com/> .
 _:node7a12e48e321d29211c8b7c2ce396854 <http://microformats.org/profile/hcard#given-name> "Trekkie" <http://bob.example.com/> .
 _:node7a12e48e321d29211c8b7c2ce396854 <http://microformats.org/profile/hcard#n> _:node45173ea18b736c2e9c3136e52ed3727e <http://bob.example.com/> .