improve JsonCleaningInputStream
author Hans <firedrake93@gmail.com>
Mon, 6 Aug 2018 19:31:08 +0000 (14:31 -0500)
committer Hans <firedrake93@gmail.com>
Mon, 6 Aug 2018 19:31:08 +0000 (14:31 -0500)
core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java
core/src/main/java/org/apache/any23/extractor/rdf/JsonCleaningInputStream.java [new file with mode: 0644]
core/src/test/java/org/apache/any23/Any23Test.java
core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java

index ea582cb..796bada 100644 (file)
@@ -46,7 +46,6 @@ import org.slf4j.LoggerFactory;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.PushbackInputStream;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.HashSet;
@@ -215,150 +214,4 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
         }
     }
 
-
-    static class JsonCleaningInputStream extends InputStream {
-
-        private boolean inEscape;
-        private int quoteChar;
-        private boolean inCDATA;
-        private boolean needsComma;
-
-        private final PushbackInputStream wrapped;
-
-        JsonCleaningInputStream(InputStream in) {
-            wrapped = new PushbackInputStream(in, 16);
-        }
-
-        private static boolean isNextOrUnread(PushbackInputStream stream, int... next) throws IOException {
-            int i = -1;
-            for (int test : next) {
-                int c = stream.read();
-                if (c != test) {
-                    if (c != -1) {
-                        stream.unread(c);
-                    }
-                    while (i >= 0) {
-                        stream.unread(next[i--]);
-                    }
-                    return false;
-                }
-                i++;
-            }
-            return true;
-        }
-
-        @Override
-        public int read() throws IOException {
-            PushbackInputStream stream = wrapped;
-
-            for (;;) {
-                int c = stream.read();
-
-                //other types of comments are handled by enabling fasterxml's
-                //ALLOW_COMMENTS and ALLOW_YAML_COMMENTS features
-                if (inCDATA) {
-                    if (c == ']' && isNextOrUnread(stream, ']', '>')) {
-                        inCDATA = false;
-                        continue;
-                    }
-                } else {
-                    if (c == '<' && isNextOrUnread(stream, '!', '[', 'C', 'D', 'A', 'T', 'A', '[')) {
-                        inCDATA = true;
-                        continue;
-                    }
-                }
-
-                int q = quoteChar;
-                if (q != 0) {
-                    //we're in a quote
-                    if (inEscape) {
-                        //end escape
-                        inEscape = false;
-                    } else if (c == '\\') {
-                        //begin escape
-                        inEscape = true;
-                    } else if (c == q) {
-                        //end quote
-                        quoteChar = 0;
-                    }
-                    return c;
-                }
-
-                //we're not in a quote
-                switch (c) {
-                    case ',':
-                    case ';':
-                        //don't write out comma yet!
-                        needsComma = true;
-                        continue;
-                    case '}':
-                    case ']':
-                        //discard comma at end of object or array
-                        needsComma = false;
-                        return c;
-                    case -1:
-                    case '\r':
-                    case '\n':
-                        return c;
-                    case 0x09:
-                    case 0x0b:
-                    case 0x0c:
-                    case 0x1c:
-                    case 0x1d:
-                    case 0x1e:
-                    case 0x1f:
-                    case 0x20:
-                        return ' ';
-                    case 0xc2:
-                        if (isNextOrUnread(stream, 0xa0)) {
-                            return ' ';
-                        }
-                        break;
-                    case 0xe1:
-                        if (isNextOrUnread(stream, 0x9a, 0x80)
-                                || isNextOrUnread(stream, 0xa0, 0x8e)) {
-                            return ' ';
-                        }
-                        break;
-                    case 0xe2:
-                        int c1 = stream.read();
-                        if (c1 == 0x80) {
-                            int c2 = stream.read();
-                            //space separators
-                            if (c2 >= 0x80 && c2 <= 0x8a || c2 == 0xaf
-                                    //line and paragraph separators
-                                    || c2 == 0xa8 || c2 == 0xa9) {
-                                return ' ';
-                            }
-                            stream.unread(c2);
-                        } else if (c1 == 0x81) {
-                            int c2 = stream.read();
-                            if (c2 == 0x9f) {
-                                return ' ';
-                            }
-                            stream.unread(c2);
-                        }
-                        stream.unread(c1);
-                        break;
-                    case 0xe3:
-                        if (isNextOrUnread(stream, 0x80, 0x80)) {
-                            return ' ';
-                        }
-                        break;
-                    default:
-                        break;
-                }
-                if (needsComma) {
-                    stream.unread(c);
-                    stream.unread(' ');
-                    needsComma = false;
-                    return ',';
-                } else if (c == '"' || c == '\'') {
-                    quoteChar = c;
-                }
-                return c;
-            }
-        }
-    }
-
 }
index 71f2459..1806adf 100644 (file)
@@ -56,15 +56,15 @@ public class JSONLDExtractor extends BaseRDFExtractor {
         }
 
         JSON_FACTORY.enable(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER);
-        JSON_FACTORY.enable(JsonParser.Feature.ALLOW_COMMENTS);
+        JSON_FACTORY.disable(JsonParser.Feature.ALLOW_COMMENTS); //handled by JsonCleaningInputStream
         JSON_FACTORY.disable(JsonParser.Feature.ALLOW_MISSING_VALUES); //handled by JsonCleaningInputStream
         JSON_FACTORY.enable(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS);
         JSON_FACTORY.enable(JsonParser.Feature.ALLOW_NUMERIC_LEADING_ZEROS);
-        JSON_FACTORY.enable(JsonParser.Feature.ALLOW_SINGLE_QUOTES);
+        JSON_FACTORY.disable(JsonParser.Feature.ALLOW_SINGLE_QUOTES); //handled by JsonCleaningInputStream
         JSON_FACTORY.disable(JsonParser.Feature.ALLOW_TRAILING_COMMA); //handled by JsonCleaningInputStream
         JSON_FACTORY.enable(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS);
         JSON_FACTORY.enable(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES);
-        JSON_FACTORY.enable(JsonParser.Feature.ALLOW_YAML_COMMENTS);
+        JSON_FACTORY.disable(JsonParser.Feature.ALLOW_YAML_COMMENTS); //handled by JsonCleaningInputStream
         JSON_FACTORY.enable(JsonParser.Feature.IGNORE_UNDEFINED);
         JSON_FACTORY.enable(JsonParser.Feature.INCLUDE_SOURCE_IN_LOCATION);
         JSON_FACTORY.disable(JsonParser.Feature.STRICT_DUPLICATE_DETECTION);
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/JsonCleaningInputStream.java b/core/src/main/java/org/apache/any23/extractor/rdf/JsonCleaningInputStream.java
new file mode 100644 (file)
index 0000000..bda229e
--- /dev/null
@@ -0,0 +1,305 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.extractor.rdf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+
+/**
+ * This class uses several strategies to fix common JSON syntax errors, including:
+ * <ol>
+ *     <li>Remove CDATA markers</li>
+ *     <li>Remove YAML and C-style comments</li>
+ *     <li>Allow single-quoted strings</li>
+ *     <li>Ignore duplicated commas between elements of objects and arrays</li>
+ *     <li>Remove trailing commas from objects and arrays</li>
+ *     <li>Insert omitted commas after objects and arrays</li>
+ *     <li>Ignore all unicode whitespace characters (assumes UTF-8 encoding)</li>
+ *     <li>Treat semi-colons as commas</li>
+ * </ol>
+ * <p>
+ * A comma is not written out when it is read: it is remembered and only emitted
+ * once the next significant character shows that it is needed. A single space, or
+ * the last line break and the indentation following it, is then replayed through
+ * the pushback buffer after the comma.
+ * </p>
+ *
+ * @author Hans Brende (hansbrende@apache.org)
+ */
+class JsonCleaningInputStream extends InputStream {
+
+    //values of currentState; any other non-zero value is the quote character
+    //(' or ") that opened the string currently being read
+    private static final int EOL_COMMENT = 1;
+    private static final int MULTILINE_COMMENT = 2;
+
+    //values of needsComma; larger values additionally count the whitespace
+    //seen since the last swallowed line break
+    private static final int NEEDS_COMMA = 1;
+    private static final int NEEDS_COMMA_AND_NEWLINE = 2;
+
+    //upper bound for needsComma: further whitespace is dropped instead of
+    //letting the counter wrap around to 0, which would lose the pending comma
+    private static final int MAX_NEEDS_COMMA = 0xFF;
+
+    //most bytes a failed lookahead pushes back: the 8 that follow '<' in "<![CDATA["
+    private static final int MAX_LOOKAHEAD = 8;
+
+    //room for the deferred character plus its replayed whitespace and line break
+    //(MAX_NEEDS_COMMA bytes at most) on top of a pushed-back lookahead
+    private static final int PUSHBACK_CAPACITY = MAX_NEEDS_COMMA + MAX_LOOKAHEAD;
+
+    private boolean inEscape;
+    private boolean inCDATA;
+    private int needsComma;
+    private int currentState;
+
+    private final PushbackInputStream in;
+
+    /**
+     * @param in the raw, possibly malformed, JSON bytes to clean up
+     */
+    JsonCleaningInputStream(InputStream in) {
+        this.in = new PushbackInputStream(in, PUSHBACK_CAPACITY);
+    }
+
+    //pushes back a byte previously read, ignoring the EOF marker
+    private static void unread(PushbackInputStream in, int c) throws IOException {
+        if (c != -1) {
+            in.unread(c);
+        }
+    }
+
+    //consumes the given bytes if they are next in the stream; otherwise
+    //restores everything that was read and returns false
+    private static boolean isNextOrUnread(PushbackInputStream in, int... next) throws IOException {
+        int i = -1;
+        for (int test : next) {
+            int c = in.read();
+            if (c != test) {
+                unread(in, c);
+                while (i >= 0) {
+                    in.unread(next[i--]);
+                }
+                return false;
+            }
+            i++;
+        }
+        return true;
+    }
+
+    /**
+     * Returns the next byte of the cleaned-up JSON, or -1 at the end of the stream.
+     */
+    @Override
+    public int read() throws IOException {
+        PushbackInputStream in = this.in;
+
+        for (;;) {
+            int c = in.read();
+
+            if (c == -1) {
+                return c;
+            }
+
+            //CDATA markers themselves are dropped, whatever the current state
+            if (inCDATA) {
+                if (c == ']' && isNextOrUnread(in, ']', '>')) {
+                    inCDATA = false;
+                    continue;
+                }
+            } else {
+                if (c == '<' && isNextOrUnread(in, '!', '[', 'C', 'D', 'A', 'T', 'A', '[')) {
+                    inCDATA = true;
+                    continue;
+                }
+            }
+
+            int ctx = currentState;
+            switch (ctx) {
+                case 0:
+                    break;
+                case EOL_COMMENT:
+                    if (c == '\r' || c == '\n') {
+                        //end single-line comment
+                        currentState = 0;
+                        if (needsComma != 0) {
+                            needsComma = NEEDS_COMMA_AND_NEWLINE;
+                            continue;
+                        }
+                        return c;
+                    }
+                    continue;
+                case MULTILINE_COMMENT:
+                    if (c == '\r' || c == '\n') {
+                        if (needsComma != 0) {
+                            needsComma = NEEDS_COMMA_AND_NEWLINE;
+                            continue;
+                        }
+                        return c;
+                    } else if (c == '*' && isNextOrUnread(in, '/')) {
+                        //end multiline comment
+                        currentState = 0;
+                    }
+                    continue;
+                default:
+                    //we're in a quote
+                    if (inEscape) {
+                        //end escape
+                        inEscape = false;
+                    } else if (c == '\\') {
+                        //begin escape
+                        inEscape = true;
+                    } else if (c == ctx) {
+                        //end quote
+                        currentState = 0;
+                        return '"';
+                    }
+                    return c;
+            }
+
+            $whitespace: {
+                //we're not in a quote
+                switch (c) {
+                    case '#':
+                        currentState = EOL_COMMENT;
+                        continue;
+                    case '/':
+                        int next = in.read();
+                        if (next == '/') {
+                            currentState = EOL_COMMENT;
+                            continue;
+                        } else if (next == '*') {
+                            currentState = MULTILINE_COMMENT;
+                            continue;
+                        }
+                        unread(in, next);
+                        break;
+                    case ',':
+                    case ';':
+                        //don't write out comma yet!
+                        needsComma = NEEDS_COMMA;
+                        continue;
+                    case '}':
+                    case ']':
+                        // Only thing that can follow '}' or ']' is:
+                        // '}' or ']' or ',' or EOF
+                        needsComma = NEEDS_COMMA;
+                        return c;
+                    case '\r':
+                    case '\n':
+                        if (needsComma != 0) {
+                            needsComma = NEEDS_COMMA_AND_NEWLINE;
+                            continue;
+                        }
+                        return c;
+                    // UTF-8 whitespace detection
+                    case 0x09:
+                    case 0x0b:
+                    case 0x0c:
+                    case 0x1c:
+                    case 0x1d:
+                    case 0x1e:
+                    case 0x1f:
+                    case 0x20:
+                        break $whitespace;
+                    case 0xc2:
+                        if (isNextOrUnread(in, 0xa0)) {
+                            break $whitespace;
+                        }
+                        break;
+                    case 0xe1:
+                        if (isNextOrUnread(in, 0x9a, 0x80)
+                                || isNextOrUnread(in, 0xa0, 0x8e)) {
+                            break $whitespace;
+                        }
+                        break;
+                    case 0xe2:
+                        int c1 = in.read();
+                        if (c1 == 0x80) {
+                            int c2 = in.read();
+                            //space separators
+                            if (c2 >= 0x80 && c2 <= 0x8a || c2 == 0xaf
+                                    //line and paragraph separators
+                                    || c2 == 0xa8 || c2 == 0xa9) {
+                                break $whitespace;
+                            }
+                            unread(in, c2);
+                            in.unread(0x80);
+                        } else if (c1 == 0x81) {
+                            int c2 = in.read();
+                            if (c2 == 0x9f) {
+                                break $whitespace;
+                            }
+                            unread(in, c2);
+                            in.unread(0x81);
+                        } else {
+                            unread(in, c1);
+                        }
+                        break;
+                    case 0xe3:
+                        if (isNextOrUnread(in, 0x80, 0x80)) {
+                            break $whitespace;
+                        }
+                        break;
+                    default:
+                        break;
+                }
+
+                //here: character is not whitespace
+
+                int nc = needsComma;
+                if (nc != 0) {
+                    //emit the deferred comma now; c and the skipped whitespace are re-read after it
+                    in.unread(c);
+                    if (nc == NEEDS_COMMA) {
+                        in.unread(' ');
+                    } else {
+                        for (int i = NEEDS_COMMA_AND_NEWLINE; i < nc; i++) {
+                            in.unread(' ');
+                        }
+                        in.unread('\n');
+                    }
+                    needsComma = 0;
+                    return ',';
+                } else if (c == '"' || c == '\'') {
+                    currentState = c;
+                    return '"';
+                }
+                return c;
+            } //end $whitespace
+
+            //here: character is whitespace
+
+            int nc = needsComma;
+            if (nc != 0) {
+                if (nc != NEEDS_COMMA) {
+                    //count indentation after a swallowed line break, saturating
+                    //so that the replay always fits the pushback buffer
+                    needsComma = Math.min(nc + 1, MAX_NEEDS_COMMA);
+                }
+                continue;
+            }
+
+            return ' ';
+
+        }
+
+    }
+}
index 085db04..d1d3467 100644 (file)
@@ -345,7 +345,7 @@ public class Any23Test extends Any23OnlineTestBase {
         } finally {
             compositeTH1.close();
         }
-        logger.info(baos.toString());
+        logger.debug(baos.toString());
         Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES,
                 cth1.getCount());
 
index 215b552..f1338b4 100644 (file)
@@ -71,7 +71,7 @@ public class JSONLDExtractorTest {
     for (int i = 0; i <= Character.MAX_CODE_POINT; i++) {
       if (Character.isWhitespace(i) || Character.isSpaceChar(i)) {
         byte[] bytes = new String(Character.toChars(i)).getBytes(StandardCharsets.UTF_8);
-        InputStream stream = new BaseRDFExtractor.JsonCleaningInputStream(new ByteArrayInputStream(bytes));
+        InputStream stream = new JsonCleaningInputStream(new ByteArrayInputStream(bytes));
         if (i == '\r' || i == '\n') {
           Assert.assertEquals(stream.read(), i);
         } else {