ANY23-382 don't kill extraction on fatal json parsing errors
author Hans <firedrake93@gmail.com>
Fri, 3 Aug 2018 21:06:15 +0000 (16:06 -0500)
committer Hans <firedrake93@gmail.com>
Fri, 3 Aug 2018 21:06:15 +0000 (16:06 -0500)
core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java
core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java
core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
test-resources/src/test/resources/html/html-jsonld-fatal-error.html [new file with mode: 0644]

index c0994bd..0e32efc 100644 (file)
 
 package org.apache.any23.extractor.rdf;
 
+import com.fasterxml.jackson.core.JsonLocation;
+import com.fasterxml.jackson.core.JsonParseException;
 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.Extractor;
+import org.apache.any23.extractor.IssueReport;
 import org.apache.any23.extractor.html.JsoupUtils;
 import org.eclipse.rdf4j.rio.RDFFormat;
 import org.eclipse.rdf4j.rio.RDFParseException;
@@ -197,7 +200,18 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
         } catch (RDFHandlerException ex) {
             throw new IllegalStateException("Unexpected exception.", ex);
         } catch (RDFParseException ex) {
-            throw new ExtractionException("Error while parsing RDF document.", ex, extractionResult);
+            Throwable cause = ex.getCause();
+            if (cause instanceof JsonParseException) {
+                JsonParseException err = (JsonParseException)cause;
+                JsonLocation loc = err.getLocation();
+                if (loc == null) {
+                    extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, err.getOriginalMessage(), -1L, -1L);
+                } else {
+                    extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, err.getOriginalMessage(), loc.getLineNr(), loc.getColumnNr());
+                }
+            } else {
+                throw new ExtractionException("Error while parsing RDF document.", ex, extractionResult);
+            }
         }
     }
 
@@ -205,7 +219,7 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
     private static class JsonCleaningInputStream extends InputStream {
 
         private boolean inEscape;
-        private boolean inQuote;
+        private int quoteChar;
         private boolean inCDATA;
         private boolean needsComma;
 
@@ -240,13 +254,37 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
             for (;;) {
                 int c = stream.read();
 
-                if (inQuote) {
-                    return readQuoted(c, stream);
+                //other types of comments are handled by enabling fasterxml's
+                //ALLOW_COMMENTS and ALLOW_YAML_COMMENTS features
+                if (inCDATA) {
+                    if (c == ']' && isNextOrUnread(stream, ']', '>')) {
+                        inCDATA = false;
+                        continue;
+                    }
+                } else {
+                    if (c == '<' && isNextOrUnread(stream, '!', '[', 'C', 'D', 'A', 'T', 'A', '[')) {
+                        inCDATA = true;
+                        continue;
+                    }
                 }
 
-                //we're not in a quote
-                c = stripComments(c, stream);
+                int q = quoteChar;
+                if (q != 0) {
+                    //we're in a quote
+                    if (inEscape) {
+                        //end escape
+                        inEscape = false;
+                    } else if (c == '\\') {
+                        //begin escape
+                        inEscape = true;
+                    } else if (c == q) {
+                        //end quote
+                        quoteChar = 0;
+                    }
+                    return c;
+                }
 
+                //we're not in a quote
                 switch (c) {
                     case ',':
                     case ';':
@@ -258,150 +296,21 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
                         //discard comma at end of object or array
                         needsComma = false;
                         return c;
-                    case -1:
-                        return c;
-                    default:
-                        if (Character.isWhitespace(c)) {
-                            return ' ';
-                        } else if (needsComma) {
-                            stream.unread(c);
-                            stream.unread(' ');
-                            needsComma = false;
-                            return ',';
-                        } else if (c == '"') {
-                            inQuote = true;
-                        }
-                        return c;
-                }
-            }
-
-        }
-
-        private int readQuoted(int c, PushbackInputStream stream) throws IOException {
-            if (inEscape) {
-                switch (c) {
-                    case 'u':
-                        //TODO: validate that 'u' is followed by 4 hex chars?
-                    case '"':
-                    case '\\':
-                    case '/':
-                    case 'b':
-                    case 'f':
-                    case 'n':
-                    case 'r':
-                    case 't':
-                    case -1:
-                        inEscape = false;
-                        return c;
                     default:
-                        stream.unread(c);
-                        inEscape = false;
-                        return '\\';
-                }
-            } else {
-                switch (c) {
-                    case '\\':
-                        break;
-                    case '\n':
-                        stream.unread('n');
-                        break;
-                    case '\r':
-                        stream.unread('r');
-                        break;
-                    case '\b':
-                        stream.unread('b');
-                        break;
-                    case '\f':
-                        stream.unread('f');
-                        break;
-                    case '\t':
-                        stream.unread('t');
-                        break;
-                    case '"':
-                        inQuote = false;
-                        return c;
-                    case -1:
-                        return c;
-                    default:
-                        if (c < 0x20 || c == 0x7f) {
-                            String hex = Integer.toHexString(c);
-                            int ind = hex.length() - 1;
-                            stream.unread(hex.charAt(ind));
-                            stream.unread(ind == 0 ? '0' : hex.charAt(--ind));
-                            stream.unread(ind == 0 ? '0' : hex.charAt(--ind));
-                            stream.unread(ind == 0 ? '0' : hex.charAt(--ind));
-                            stream.unread('u');
-                            break;
-                        } else {
-                            return c;
-                        }
-                }
-                inEscape = true;
-                return '\\';
-            }
-        }
-
-        private int stripComments(int c, PushbackInputStream stream) throws IOException {
-            switch (c) {
-                case '/':
-                    if (isNextOrUnread(stream, '/')) {
-                        //single line comment: read to end of line
-                        for (;;) {
-                            c = stream.read();
-                            if (c == -1 || c == '\r' || c == '\n') {
-                                return c;
+                        if (c != -1 && !Character.isWhitespace(c)) {
+                            if (needsComma) {
+                                stream.unread(c);
+                                stream.unread(' ');
+                                needsComma = false;
+                                return ',';
+                            } else if (c == '"' || c == '\'') {
+                                quoteChar = c;
                             }
                         }
-                    } else if (isNextOrUnread(stream,'*')) {
-                        //multiline comment: read till next "*/"
-                        for (;;) {
-                            c = stream.read();
-                            if (c == -1) {
-                                return c;
-                            } else if (c == '*') {
-                                c = stream.read();
-                                if (c == -1) {
-                                    return c;
-                                } else if (c == '/') {
-                                    //replace entire comment with single space
-                                    return ' ';
-                                }
-                            }
-                        }
-                    } else {
-                        return c;
-                    }
-                case '<':
-                    if (isNextOrUnread(stream,'!','[','C','D','A','T','A','[')) {
-                        inCDATA = true;
-                        return ' ';
-                    } else {
-                        return c;
-                    }
-                case '#':
-                    for (;;) {
-                        c = stream.read();
-                        if (c == -1 || c == '\r' || c == '\n') {
-                            return c;
-                        }
-                    }
-                case ']':
-                    if (inCDATA) {
-                        if (isNextOrUnread(stream, ']', '>')) {
-                            inCDATA = false;
-                            return ' ';
-                        } else {
-                            return c;
-                        }
-                    } else {
                         return c;
-                    }
-                default:
-                    return c;
+                }
             }
-
         }
-
     }
 
 }
index 402e267..71f2459 100644 (file)
 
 package org.apache.any23.extractor.rdf;
 
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonParser;
 import com.github.jsonldjava.utils.JsonUtils;
 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.eclipse.rdf4j.rio.RDFParser;
 
+import java.lang.reflect.Field;
+
 /**
  * Concrete implementation of {@link org.apache.any23.extractor.Extractor.ContentExtractor}
  * handling <a href="http://www.w3.org/TR/json-ld/">JSON-LD</a> format.
@@ -41,6 +45,29 @@ public class JSONLDExtractor extends BaseRDFExtractor {
             throw new AssertionError("You have an outdated version of jsonld-java on the classpath. " +
                     "Upgrade to at least version 0.12.0. See: https://issues.apache.org/jira/browse/ANY23-336", th);
         }
+
+        JsonFactory JSON_FACTORY;
+        try {
+            Field field = JsonUtils.class.getDeclaredField("JSON_FACTORY");
+            field.setAccessible(true);
+            JSON_FACTORY = (JsonFactory)field.get(null);
+        } catch (Exception e) {
+            throw new AssertionError(e);
+        }
+
+        JSON_FACTORY.enable(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER);
+        JSON_FACTORY.enable(JsonParser.Feature.ALLOW_COMMENTS);
+        JSON_FACTORY.disable(JsonParser.Feature.ALLOW_MISSING_VALUES); //handled by JsonCleaningInputStream
+        JSON_FACTORY.enable(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS);
+        JSON_FACTORY.enable(JsonParser.Feature.ALLOW_NUMERIC_LEADING_ZEROS);
+        JSON_FACTORY.enable(JsonParser.Feature.ALLOW_SINGLE_QUOTES);
+        JSON_FACTORY.disable(JsonParser.Feature.ALLOW_TRAILING_COMMA); //handled by JsonCleaningInputStream
+        JSON_FACTORY.enable(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS);
+        JSON_FACTORY.enable(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES);
+        JSON_FACTORY.enable(JsonParser.Feature.ALLOW_YAML_COMMENTS);
+        JSON_FACTORY.enable(JsonParser.Feature.IGNORE_UNDEFINED);
+        JSON_FACTORY.enable(JsonParser.Feature.INCLUDE_SOURCE_IN_LOCATION);
+        JSON_FACTORY.disable(JsonParser.Feature.STRICT_DUPLICATE_DETECTION);
     }
 
 
index 2778621..6b4406a 100644 (file)
@@ -284,7 +284,7 @@ public class RDFParserFactory {
     ) {
         parser.getParserConfig().setNonFatalErrors(stopAtFirstError ? Collections.emptySet() : new HashSet<>(parser.getSupportedSettings()));
         parser.set(BasicParserSettings.FAIL_ON_UNKNOWN_DATATYPES, verifyDataType);
-        parser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, true);
+        parser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, verifyDataType);
 
         parser.setParseErrorListener(new InternalParseErrorListener(extractionResult));
         parser.setValueFactory(
index 41a0711..4141bd2 100644 (file)
@@ -17,6 +17,7 @@
 package org.apache.any23.extractor.html;
 
 import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.IssueReport;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.FOAF;
 import org.junit.Test;
@@ -75,13 +76,20 @@ public class EmbeddedJSONLDExtractorTest extends AbstractExtractorTestCase {
                assertExtract("/html/html-jsonld-unescaped-characters.html");
                assertModelNotEmpty();
                assertStatementsSize(null, null, null, 375);
-               assertContains(RDFUtils.iri("http://schema.org/name"), "Weezer & Pixies\\\u0008");
+               assertContains(RDFUtils.iri("http://schema.org/name"), "Weezer & Pixies\u0008");
                assertContains(RDFUtils.iri("http://schema.org/description"),
                                "#1 MAGIC SHOW IN L.A.\nThe current WINNER of the CW’s Penn & Teller’s FOOL US, Illusionist " +
                                                "extraordinaire Ivan Amodei is on a national tour with his show INTIMATE ILLUSIONS." +
                                                "\n\nCurrently, on an ei...");
        }
 
+       @Test
+       public void testJSONLDFatalError() {
+               assertExtract("/html/html-jsonld-fatal-error.html",false); // fixture's first JSON-LD script omits the comma after its "name" entry; NOTE(review): meaning of the 'false' flag is not visible here - confirm
+               assertIssue(IssueReport.IssueLevel.FATAL, ".*Unexpected character .* was expecting comma to separate Object entries.*"); // the JSON parse failure must surface as a FATAL issue report
+               assertStatementsSize(null, null, null, 4); // statements are still produced despite the error; presumably the four ical:* properties of the second, well-formed script - confirm
+       }
+
        @Override
        protected ExtractorFactory<?> getExtractorFactory() {
                return new EmbeddedJSONLDExtractorFactory();
diff --git a/test-resources/src/test/resources/html/html-jsonld-fatal-error.html b/test-resources/src/test/resources/html/html-jsonld-fatal-error.html
new file mode 100644 (file)
index 0000000..1ccb7ab
--- /dev/null
@@ -0,0 +1,61 @@
+<!DOCTYPE html>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--> <!-- Excerpted from: http://osl.ugr.es/JSLUGR/ -->
+<html lang="es">
+
+<head>
+    <title>Jornadas de Software Libre de la Universidad de Granada</title>
+</head>
+
+<body id="page-top" data-spy="scroll" data-target=".navbar-fixed-top">
+
+
+
+<script type="application/ld+json">
+       {
+         "@context": "http://schema.org",
+         "@type": "Organization",
+         "url": "http://osl.ugr.es",
+         "contactPoint": [{
+           "@type": "ContactPoint",
+           "email": "osl@ugr.es",
+               "name": "Jornadas de Software Libre"
+           "contactType": "Organizing committee",
+               "url": "http://osl.ugr.es"
+         }]
+       }
+       </script>
+
+<script type="application/ld+json">
+       {
+         "@context": {
+           "ical": "http://www.w3.org/2002/12/cal/ical#",
+           "xsd": "http://www.w3.org/2001/XMLSchema#",
+           "ical:dtstart": {
+             "@type": "xsd:dateTime"
+           }
+         },
+         "ical:summary": "Jornadas de Software Libre",
+         "ical:location": "Por determinar. Granada,  España",
+         "ical:dtstart": "2017-09-27T08:00Z",
+         "ical:dtend": "2017-09-28T16:00Z"
+       }
+       </script>
+
+</body>
+
+</html>
\ No newline at end of file