ANY23-356 Removed nekohtml dependency
author: Hans <firedrake93@gmail.com>
Mon, 2 Jul 2018 20:26:15 +0000 (15:26 -0500)
committer: Hans <firedrake93@gmail.com>
Mon, 2 Jul 2018 23:24:38 +0000 (18:24 -0500)
api/src/main/resources/default-configuration.properties
cli/pom.xml
core/pom.xml
core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
plugins/html-scraper/pom.xml
pom.xml

index a8ca0c2..4f68586 100644 (file)
@@ -76,7 +76,3 @@ any23.extraction.csv.comment=#
 # A confidence threshold for the OpenIE extractions
 # Any extractions below this value will not be processed.
 any23.extraction.openie.confidence.threshold=0.5
-
-# Use legacy setting to parse html
-# with NekoHTML instead of Jsoup
-any23.tagsoup.legacy=off
index 0cae013..0f04c62 100644 (file)
       <artifactId>commons-codec</artifactId>
     </dependency>
     <dependency>
-      <groupId>net.sourceforge.nekohtml</groupId>
-      <artifactId>nekohtml</artifactId>
-    </dependency>
-    <dependency>
       <groupId>com.beust</groupId>
       <artifactId>jcommander</artifactId>
     </dependency>
index e492fb6..377a5ee 100644 (file)
       <artifactId>commons-lang</artifactId>
     </dependency>
 
-    <dependency>
-      <groupId>net.sourceforge.nekohtml</groupId>
-      <artifactId>nekohtml</artifactId>
-    </dependency>
-
     <dependency> <!-- used by Tika -->
       <groupId>org.jsoup</groupId>
       <artifactId>jsoup</artifactId>
     </dependency>
     <!-- END: POI -->
 
+    <dependency>
+      <groupId>xerces</groupId>
+      <artifactId>xercesImpl</artifactId>
+    </dependency>
+
     <!-- BEGIN: Test Dependencies -->
     <dependency>
       <groupId>junit</groupId>
index d96a07b..4f54018 100644 (file)
@@ -20,19 +20,10 @@ package org.apache.any23.extractor.html;
 import org.apache.any23.validator.DefaultValidator;
 import org.apache.any23.validator.Validator;
 import org.apache.any23.validator.ValidatorException;
-import org.apache.xerces.xni.Augmentations;
-import org.apache.xerces.xni.QName;
-import org.apache.xerces.xni.XMLAttributes;
-import org.apache.xerces.xni.XNIException;
-import org.cyberneko.html.parsers.DOMParser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
 
-import javax.xml.transform.TransformerException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URI;
@@ -42,13 +33,12 @@ import java.nio.charset.UnsupportedCharsetException;
 
 /**
  * <p>Parses an {@link java.io.InputStream}
- * into an <i>HTML DOM</i> tree using a <i>TagSoup</i> parser.
+ * into an <i>HTML DOM</i> tree.
  * </p>
  * <p><strong>Note:</strong> The resulting <i>DOM</i> tree will not be namespace
  * aware, and all element names will be upper case, while attributes
- * will be lower case. This is because the
- * <a href="http://nekohtml.sourceforge.net/">NekoHTML</a> based <i>TagSoup</i> parser
- * by default uses the <a href="http://xerces.apache.org/xerces2-j/dom.html">Xerces HTML DOM</a>
+ * will be lower case. This is because the HTML parser
+ * uses the <a href="http://xerces.apache.org/xerces2-j/dom.html">Xerces HTML DOM</a>
  * implementation, which doesn't support namespaces and forces uppercase element names. This works
  * with the <i>RDFa XSLT Converter</i> and with <i>XPath</i>, so we left it this way.</p>
  *
@@ -61,8 +51,6 @@ public class TagSoupParser {
 
     public static final String ELEMENT_LOCATION = "Element-Location";
 
-    private static final String AUGMENTATIONS_FEATURE = "http://cyberneko.org/html/features/augmentations";
-
     private final static Logger logger = LoggerFactory.getLogger(TagSoupParser.class);
 
     private final InputStream input;
@@ -139,103 +127,6 @@ public class TagSoupParser {
         return new DocumentReport( validator.validate(dIRI, document, applyFix), document );
     }
 
-
-    static TagSoupParsingConfiguration legacyConfig() {
-        return NekoHTML.instance;
-    }
-
-    private static class NekoHTML extends TagSoupParsingConfiguration {
-
-        private static final NekoHTML instance = new NekoHTML();
-
-        @Override
-        Document parse(InputStream input, String documentIRI, String encoding) throws IOException {
-            try {
-                return parse(input, encoding);
-            } catch (SAXException ex) {
-                // should not happen, it's a tag soup parser
-                throw new RuntimeException("Should not happen, it's a tag soup parser", ex);
-            } catch (TransformerException ex) {
-                // should not happen, it's a tag soup parser
-                throw new RuntimeException("Should not happen, it's a tag soup parser", ex);
-            } catch (NullPointerException ex) {
-                if (ex.getStackTrace()[0].getClassName().equals("java.io.Reader")) {
-                    throw new RuntimeException("Bug in NekoHTML, try upgrading to newer release!", ex);
-                } else {
-                    throw ex;
-                }
-            }
-        }
-
-        private Document parse(InputStream input, String encoding) throws IOException, SAXException, TransformerException {
-            final DOMParser parser = new DOMParser() {
-
-                private QName currentQName;
-                private Augmentations currentAugmentations;
-
-                @Override
-                protected Element createElementNode(QName qName) {
-                    final Element created = super.createElementNode(qName);
-                    if (qName.equals(currentQName) && currentAugmentations != null) {
-                        final ElementLocation elementLocation = createElementLocation(
-                                currentAugmentations.getItem(AUGMENTATIONS_FEATURE)
-                        );
-                        created.setUserData(ELEMENT_LOCATION, elementLocation, null);
-                    }
-                    return created;
-                }
-
-                @Override
-                public void startElement(QName qName, XMLAttributes xmlAttributes, Augmentations augmentations)
-                        throws XNIException {
-                    super.startElement(qName, xmlAttributes, augmentations);
-                    currentQName = qName;
-                    currentAugmentations = augmentations;
-                }
-
-                private ElementLocation createElementLocation(Object obj) {
-                    if(obj == null) return null;
-                    String pattern = null;
-                    try {
-                        pattern = obj.toString();
-                        if( "synthesized".equals(pattern) ) return null;
-                        final String[] parts = pattern.split(":");
-                        return new ElementLocation(
-                                Integer.parseInt(parts[0]),
-                                Integer.parseInt(parts[1]),
-                                Integer.parseInt(parts[3]),
-                                Integer.parseInt(parts[4])
-
-                        );
-                    } catch (Exception e) {
-                        logger.warn(
-                                String.format("Unexpected string format for given augmentation: [%s]", pattern),
-                                e
-                        );
-                        return null;
-                    }
-                }
-            };
-            parser.setFeature("http://xml.org/sax/features/namespaces", false);
-            parser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-cdata-delims", true);
-            parser.setFeature(AUGMENTATIONS_FEATURE, true);
-            if (encoding != null)
-                parser.setProperty("http://cyberneko.org/html/properties/default-encoding", encoding);
-
-            /*
-             * NOTE: the SpanCloserInputStream has been added to wrap the stream passed to the CyberNeko
-             *       parser. This will ensure the correct handling of inline HTML SPAN tags.
-             *       This fix is documented at issue #78.
-             */
-            parser.parse(new InputSource( new SpanCloserInputStream(input)));
-            return parser.getDocument();
-        }
-
-
-    }
-
-
-
     /**
      * Describes a <i>DOM Element</i> location.
      */
index 2aeaac1..018a333 100644 (file)
@@ -17,7 +17,6 @@
 
 package org.apache.any23.extractor.html;
 
-import org.apache.any23.configuration.DefaultConfiguration;
 import org.jsoup.nodes.Attribute;
 import org.jsoup.select.NodeTraversor;
 import org.jsoup.select.NodeVisitor;
@@ -35,8 +34,6 @@ import java.io.InputStream;
  */
 abstract class TagSoupParsingConfiguration {
 
-    static final String LEGACY_PROPERTY = "any23.tagsoup.legacy";
-
     String name() {
         return getClass().getSimpleName();
     }
@@ -45,14 +42,7 @@ abstract class TagSoupParsingConfiguration {
 
 
     static TagSoupParsingConfiguration getDefault() {
-        return Default.instance;
-    }
-
-    private static class Default {
-
-        private static final TagSoupParsingConfiguration instance = DefaultConfiguration.singleton()
-                .getFlagProperty(LEGACY_PROPERTY) ? TagSoupParser.legacyConfig() : JsoupConfig.instance;
-
+        return JsoupConfig.instance;
     }
 
 
index e24f6b6..b651d73 100644 (file)
     <dependency>
       <groupId>net.sourceforge.nekohtml</groupId>
       <artifactId>nekohtml</artifactId>
-      <scope>provided</scope>
+      <version>1.9.22</version>
     </dependency>
     <dependency>
       <groupId>xerces</groupId>
       <artifactId>xercesImpl</artifactId>
-      <version>2.12.0</version>
-      <scope>provided</scope>
-      <exclusions>
-        <exclusion>
-          <groupId>xml-apis</groupId>
-          <artifactId>xml-apis</artifactId>
-        </exclusion>
-      </exclusions>
     </dependency>
     <dependency>
       <groupId>de.l3s.boilerpipe</groupId>
diff --git a/pom.xml b/pom.xml
index 8d3d408..1e57b2c 100644 (file)
--- a/pom.xml
+++ b/pom.xml
         <version>1.17</version>
       </dependency>
       <dependency>
-        <groupId>net.sourceforge.nekohtml</groupId>
-        <artifactId>nekohtml</artifactId>
-        <version>1.9.20</version>
+        <groupId>xerces</groupId>
+        <artifactId>xercesImpl</artifactId>
+        <version>2.12.0</version>
       </dependency>
       <dependency>
         <groupId>org.jsoup</groupId>