ANY23-383 allow all unicode space characters in JSON-LD
author Hans <firedrake93@gmail.com>
Sat, 4 Aug 2018 05:47:16 +0000 (00:47 -0500)
committer Hans <firedrake93@gmail.com>
Sat, 4 Aug 2018 15:06:58 +0000 (10:06 -0500)
cli/pom.xml
core/pom.xml
core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
encoding/pom.xml
mime/pom.xml
pom.xml
test-resources/src/test/resources/html/html-jsonld-bad-character.html [new file with mode: 0644]

diff --git a/cli/pom.xml b/cli/pom.xml
index 573646e..fdd7dea 100644 (file)
--- a/cli/pom.xml
+++ b/cli/pom.xml
         </exclusion>
       </exclusions>
     </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-databind</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-annotations</artifactId>
+    </dependency>
     <!-- END: Tika -->
 
     <!-- BEGIN: RDF4J -->
diff --git a/core/pom.xml b/core/pom.xml
index 12cc6ae..49a1bfc 100644 (file)
--- a/core/pom.xml
+++ b/core/pom.xml
         <groupId>org.apache.commons</groupId>
         <artifactId>commons-compress</artifactId>
     </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-databind</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-annotations</artifactId>
+    </dependency>
     <!-- END: Tika -->
 
     <!-- BEGIN: RDF4J -->
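diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
index 0e32efc..797d878 100644 (file)
--- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java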
@@ -216,7 +216,7 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
     }
 
 
-    private static class JsonCleaningInputStream extends InputStream {
+    static class JsonCleaningInputStream extends InputStream {
 
         private boolean inEscape;
         private int quoteChar;
@@ -290,25 +290,73 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
                     case ';':
                         //don't write out comma yet!
                         needsComma = true;
-                        break;
+                        continue;
                     case '}':
                     case ']':
                         //discard comma at end of object or array
                         needsComma = false;
                         return c;
-                    default:
-                        if (c != -1 && !Character.isWhitespace(c)) {
-                            if (needsComma) {
-                                stream.unread(c);
-                                stream.unread(' ');
-                                needsComma = false;
-                                return ',';
-                            } else if (c == '"' || c == '\'') {
-                                quoteChar = c;
+                    case -1:
+                    case '\r':
+                    case '\n':
+                        return c;
+                    case 0x09:
+                    case 0x0b:
+                    case 0x0c:
+                    case 0x1c:
+                    case 0x1d:
+                    case 0x1e:
+                    case 0x1f:
+                    case 0x20:
+                        return ' ';
+                    case 0xc2:
+                        if (isNextOrUnread(stream, 0xa0)) {
+                            return ' ';
+                        }
+                        break;
+                    case 0xe1:
+                        if (isNextOrUnread(stream, 0x9a, 0x80)
+                                || isNextOrUnread(stream, 0xa0, 0x8e)) {
+                            return ' ';
+                        }
+                        break;
+                    case 0xe2:
+                        int c1 = stream.read();
+                        if (c1 == 0x80) {
+                            int c2 = stream.read();
+                            //space separators
+                            if (c2 >= 0x80 && c2 <= 0x8a || c2 == 0xaf
+                                    //line and paragraph separators
+                                    || c2 == 0xa8 || c2 == 0xa9) {
+                                return ' ';
                             }
+                            stream.unread(c2);
+                        } else if (c1 == 0x81) {
+                            int c2 = stream.read();
+                            if (c2 == 0x9f) {
+                                return ' ';
+                            }
+                            stream.unread(c2);
                         }
-                        return c;
+                        stream.unread(c1);
+                        break;
+                    case 0xe3:
+                        if (isNextOrUnread(stream, 0x80, 0x80)) {
+                            return ' ';
+                        }
+                        break;
+                    default:
+                        break;
+                }
+                if (needsComma) {
+                    stream.unread(c);
+                    stream.unread(' ');
+                    needsComma = false;
+                    return ',';
+                } else if (c == '"' || c == '\'') {
+                    quoteChar = c;
                 }
+                return c;
             }
         }
     }
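diff --git a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
index 4141bd2..5daedd4 100644 (file)
--- a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java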
@@ -90,6 +90,11 @@ public class EmbeddedJSONLDExtractorTest extends AbstractExtractorTestCase {
                assertStatementsSize(null, null, null, 4);
        }
 
+       @Test
+       public void testJSONLDBadCharacter() throws Exception {
+               assertExtract("/html/html-jsonld-bad-character.html");
+       }
+
        @Override
        protected ExtractorFactory<?> getExtractorFactory() {
                return new EmbeddedJSONLDExtractorFactory();
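diff --git a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
index 1e9aa6f..215b552 100644 (file)
--- a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java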
  */
 package org.apache.any23.extractor.rdf;
 
+import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
 
 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionException;
@@ -29,6 +32,7 @@ import org.apache.any23.writer.RDFXMLWriter;
 import org.apache.any23.writer.TripleHandler;
 import org.apache.any23.writer.TripleHandlerException;
 import org.junit.After;
+import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 import org.eclipse.rdf4j.model.IRI;
@@ -61,7 +65,23 @@ public class JSONLDExtractorTest {
       final IRI uri = RDFUtils.iri("http://host.com/place-example.jsonld");
       extract(uri, "/org/apache/any23/extractor/rdf/place-example.jsonld");
   }
-  
+
+  @Test
+  public void testWhitespaceCleaning() throws Exception {
+    for (int i = 0; i <= Character.MAX_CODE_POINT; i++) {
+      if (Character.isWhitespace(i) || Character.isSpaceChar(i)) {
+        byte[] bytes = new String(Character.toChars(i)).getBytes(StandardCharsets.UTF_8);
+        InputStream stream = new BaseRDFExtractor.JsonCleaningInputStream(new ByteArrayInputStream(bytes));
+        if (i == '\r' || i == '\n') {
+          Assert.assertEquals(stream.read(), i);
+        } else {
+          Assert.assertEquals(stream.read(), ' ');
+        }
+        Assert.assertEquals(stream.read(), -1);
+      }
+    }
+  }
+
   public void extract(IRI uri, String filePath) 
     throws IOException, ExtractionException, TripleHandlerException {
     ByteArrayOutputStream baos = new ByteArrayOutputStream();
diff --git a/encoding/pom.xml b/encoding/pom.xml
index 873c3de..7916ebc 100644 (file)
--- a/encoding/pom.xml
+++ b/encoding/pom.xml
       <groupId>org.slf4j</groupId> <!-- also replaces httpclient commons-logging dependency -->
       <artifactId>jcl-over-slf4j</artifactId>
     </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-databind</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-annotations</artifactId>
+    </dependency>
     <!-- END: Tika -->
 
     <!-- BEGIN: test dependencies -->
diff --git a/mime/pom.xml b/mime/pom.xml
index e4caf5e..c833def 100644 (file)
--- a/mime/pom.xml
+++ b/mime/pom.xml
       <groupId>org.slf4j</groupId>
       <artifactId>jcl-over-slf4j</artifactId>
     </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-databind</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-annotations</artifactId>
+    </dependency>
     <!-- END: Tika -->
 
 
diff --git a/pom.xml b/pom.xml
index 50ff0d9..ce2ee5d 100644 (file)
--- a/pom.xml
+++ b/pom.xml
     <tika.version>1.18</tika.version>
     <openie_2.11.version>4.2.6</openie_2.11.version>
     <openregex.version>1.1.1</openregex.version>
+    <jackson.version>2.9.6</jackson.version>
 
     <!-- Overridden in profiles to add JDK specific arguments to surefire -->
     <surefire-extra-args />
         <artifactId>poi-scratchpad</artifactId>
         <version>${poi.version}</version>
       </dependency>
+      <dependency>
+        <groupId>com.fasterxml.jackson.core</groupId>
+        <artifactId>jackson-core</artifactId>
+        <version>${jackson.version}</version>
+      </dependency>
+      <dependency>
+        <groupId>com.fasterxml.jackson.core</groupId>
+        <artifactId>jackson-databind</artifactId>
+        <version>${jackson.version}</version>
+      </dependency>
+      <dependency>
+        <groupId>com.fasterxml.jackson.core</groupId>
+        <artifactId>jackson-annotations</artifactId>
+        <version>${jackson.version}</version>
+      </dependency>
       <!-- END: Tika -->
 
       <!-- BEGIN: RDF4J -->
diff --git a/test-resources/src/test/resources/html/html-jsonld-bad-character.html b/test-resources/src/test/resources/html/html-jsonld-bad-character.html
new file mode 100644 (file)
index 0000000..659c53c
--- /dev/null
@@ -0,0 +1,43 @@
+<!DOCTYPE html>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!-- Excerpted from: https://america.france.fr/es -->
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>France.fr - La Francia inesperada por aquellas y aquellos que la confo</title>
+</head>
+<body>
+
+<script type="application/ld+json"> 

+{ 

+    "@context": "http://schema.org",
+    

"@type": "WebSite",
+    

"name": "FRANCE.FR",


+    "alternateName": "Atout France",
+    
"url": "https://www.france.fr",
+    
"potentialAction": {


+        "@type": "SearchAction",


+        "target": "https://america.france.fr/es/busqueda?q={search_term_string}",

+        "query-input": "required name=q",
+        "sameAs": ["http:\/\/www.atout-france.fr\/","https:\/\/www.diplomatie.gouv.fr\/es\/","http:\/\/media.atout-france.fr\/","http:\/\/www.meeting.france.fr\/"]
+    }
+
}
+
</script>
+
+</body>
+</html>
\ No newline at end of file