ANY23-396 Overhaul WriterFactory API
author Hans <firedrake93@gmail.com>
Fri, 14 Sep 2018 15:29:33 +0000 (10:29 -0500)
committer Hans <firedrake93@gmail.com>
Tue, 23 Oct 2018 19:14:26 +0000 (14:14 -0500)
41 files changed:
api/pom.xml
api/src/main/java/org/apache/any23/configuration/Setting.java [new file with mode: 0644]
api/src/main/java/org/apache/any23/configuration/Settings.java [new file with mode: 0644]
api/src/main/java/org/apache/any23/writer/DecoratingWriterFactory.java [new file with mode: 0644]
api/src/main/java/org/apache/any23/writer/TripleFormat.java [new file with mode: 0644]
api/src/main/java/org/apache/any23/writer/TripleWriter.java [new file with mode: 0644]
api/src/main/java/org/apache/any23/writer/TripleWriterFactory.java [new file with mode: 0644]
api/src/main/java/org/apache/any23/writer/WriterFactory.java
api/src/main/java/org/apache/any23/writer/WriterFactoryRegistry.java
api/src/test/java/org/apache/any23/configuration/SettingsTest.java [new file with mode: 0644]
api/src/test/java/org/apache/any23/writer/TripleFormatTest.java [new file with mode: 0644]
cli/src/main/java/org/apache/any23/cli/Rover.java
cli/src/test/java/org/apache/any23/cli/ExtractorsFlowTest.java [new file with mode: 0644]
cli/src/test/java/org/apache/any23/cli/RoverTest.java
cli/src/test/java/org/apache/any23/cli/flows/PeopleExtractor.java [new file with mode: 0644]
cli/src/test/java/org/apache/any23/cli/flows/PeopleExtractorFactory.java [new file with mode: 0644]
cli/src/test/resources/META-INF/services/org.apache.any23.writer.WriterFactory [new file with mode: 0644]
core/src/main/java/org/apache/any23/writer/JSONLDWriter.java
core/src/main/java/org/apache/any23/writer/JSONLDWriterFactory.java
core/src/main/java/org/apache/any23/writer/JSONWriter.java
core/src/main/java/org/apache/any23/writer/JSONWriterFactory.java
core/src/main/java/org/apache/any23/writer/NQuadsWriter.java
core/src/main/java/org/apache/any23/writer/NQuadsWriterFactory.java
core/src/main/java/org/apache/any23/writer/NTriplesWriter.java
core/src/main/java/org/apache/any23/writer/NTriplesWriterFactory.java
core/src/main/java/org/apache/any23/writer/RDFWriterTripleHandler.java
core/src/main/java/org/apache/any23/writer/RDFXMLWriter.java
core/src/main/java/org/apache/any23/writer/RDFXMLWriterFactory.java
core/src/main/java/org/apache/any23/writer/TriXWriter.java
core/src/main/java/org/apache/any23/writer/TriXWriterFactory.java
core/src/main/java/org/apache/any23/writer/TripleWriterHandler.java [new file with mode: 0644]
core/src/main/java/org/apache/any23/writer/TurtleWriter.java
core/src/main/java/org/apache/any23/writer/TurtleWriterFactory.java
core/src/main/java/org/apache/any23/writer/URIListWriter.java
core/src/main/java/org/apache/any23/writer/URIListWriterFactory.java
core/src/main/java/org/apache/any23/writer/WriterSettings.java [new file with mode: 0644]
core/src/main/java/org/apache/any23/writer/package-info.java
core/src/test/java/org/apache/any23/writer/JSONWriterTest.java
core/src/test/java/org/apache/any23/writer/WriterRegistryTest.java
service/src/main/java/org/apache/any23/servlet/WebResponder.java
test-resources/src/test/resources/cli/basic-with-stylesheet.html [new file with mode: 0644]

index ae275bd..748db36 100644 (file)
       <groupId>org.eclipse.rdf4j</groupId>
       <artifactId>rdf4j-rio-api</artifactId>
     </dependency>
+
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 
   <build>
diff --git a/api/src/main/java/org/apache/any23/configuration/Setting.java b/api/src/main/java/org/apache/any23/configuration/Setting.java
new file mode 100644 (file)
index 0000000..6932afd
--- /dev/null
@@ -0,0 +1,269 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.configuration;
+
+import java.lang.reflect.GenericArrayType;
+import java.lang.reflect.ParameterizedType;
+import java.lang.reflect.Type;
+import java.lang.reflect.TypeVariable;
+import java.util.HashMap;
+import java.util.Optional;
+import java.util.regex.Pattern;
+
+/**
+ * Represents a {@link Setting.Key Key} paired with a compatible value.
+ *
+ * @author Hans Brende (hansbrende@apache.org)
+ */
+public final class Setting<V> {
+
+    /**
+     * Convenience method for creating a new setting key with the specified identifier and value class.
+     * If the desired value type is a {@link ParameterizedType} such as {@code List<String>},
+     * or custom value-checking is required, then this method is not appropriate; instead,
+     * extend the {@link Key} class directly.
+     *
+     * @param identifier a unique identifier for this key
+     * @param valueType the type of value allowed by this key
+     * @return a new {@link Key} instance initialized with the specified identifier and value type
+     * @throws IllegalArgumentException if the identifier or value type is invalid
+     */
+    public static <V> Key<V> newKey(String identifier, Class<V> valueType) {
+        return new Key<V>(identifier, valueType) {};
+    }
+
+    /**
+     * Represents the key for a {@link Setting}.
+     */
+    public static abstract class Key<V> {
+        private final String identifier;
+        private final Type valueType;
+
+        private Key(String identifier, Class<V> valueType) {
+            this.identifier = checkIdentifier(identifier);
+            if ((this.valueType = valueType) == null) {
+                throw new IllegalArgumentException("value type cannot be null");
+            }
+
+            if (valueType.isArray()) {
+                throw new IllegalArgumentException(identifier + " value class must be immutable");
+            } else if (valueType.getTypeParameters().length != 0) {
+                throw new IllegalArgumentException(identifier + " setting key must fill in type parameters for " + valueType.toGenericString());
+            } else if (valueType.isPrimitive()) {
+                //ensure using primitive wrapper classes
+                //so that Class.isInstance(), etc. will work as expected
+                throw new IllegalArgumentException(identifier + " value class cannot be primitive");
+            }
+        }
+
+        private static final Pattern identifierPattern = Pattern.compile("[a-z][0-9a-z]*(\\.[a-z][0-9a-z]*)*");
+        private static String checkIdentifier(String identifier) {
+            if (identifier == null) {
+                throw new IllegalArgumentException("identifier cannot be null");
+            }
+            if (!identifierPattern.matcher(identifier).matches()) {
+                throw new IllegalArgumentException("identifier does not match " + identifierPattern.pattern());
+            }
+            return identifier;
+        }
+
+        /**
+         * Constructs a new key with the specified identifier.
+         * @param identifier the identifier for this key
+         * @throws IllegalArgumentException if the identifier is invalid, or the value type was determined to be invalid
+         */
+        protected Key(String identifier) {
+            this.identifier = checkIdentifier(identifier);
+
+            Type type = valueType = getValueType();
+
+            if (type instanceof Class) {
+                if (((Class) type).isArray()) {
+                    throw new IllegalArgumentException(identifier + " value class must be immutable");
+                } else if (((Class) type).getTypeParameters().length != 0) {
+                    throw new IllegalArgumentException(identifier + " setting key must fill in type parameters for " + ((Class) type).toGenericString());
+                }
+            } else if (type instanceof GenericArrayType) {
+                throw new IllegalArgumentException(identifier + " value class must be immutable");
+            } else if (type instanceof TypeVariable) {
+                throw new IllegalArgumentException("Invalid setting key type 'Key<" + type.getTypeName() + ">' for identifier " + identifier);
+            } else if (!(type instanceof ParameterizedType)) {
+                throw new IllegalArgumentException(identifier + " invalid key type " + type + " (" + type.getClass().getName() + ")");
+            }
+        }
+
+        private Type getValueType() {
+            HashMap<TypeVariable<?>, Type> mapping = new HashMap<>();
+            Class<?> rawType = getClass();
+            assert rawType != Key.class;
+            for (;;) {
+                Type superclass = rawType.getGenericSuperclass();
+                if (superclass instanceof ParameterizedType) {
+                    rawType = (Class)((ParameterizedType) superclass).getRawType();
+                    Type[] args = ((ParameterizedType) superclass).getActualTypeArguments();
+                    if (Key.class.equals(rawType)) {
+                        Type t = args[0];
+                        return mapping.getOrDefault(t, t);
+                    }
+                    TypeVariable<?>[] vars = rawType.getTypeParameters();
+                    for (int i = 0, len = vars.length; i < len; i++) {
+                        Type t = args[i];
+                        mapping.put(vars[i], t instanceof TypeVariable ? mapping.get(t) : t);
+                    }
+                } else {
+                    rawType = (Class<?>)superclass;
+                    if (Key.class.equals(rawType)) {
+                        throw new IllegalArgumentException(getClass() + " does not supply type arguments");
+                    }
+                }
+            }
+        }
+
+        /**
+         * Subclasses may override this method to check that new settings for this key are valid.
+         * The default implementation of this method throws a {@link NullPointerException} if the new value is null and the initial value was non-null.
+         *
+         * @param initial the setting containing the initial value for this key, or null if the setting has not yet been initialized
+         * @param newValue the new value for this setting
+         * @throws Exception if the new value for this setting was invalid
+         */
+        protected void checkValue(Setting<V> initial, V newValue) throws Exception {
+            if (newValue == null && initial != null && initial.value != null) {
+                throw new NullPointerException();
+            }
+        }
+
+        private Setting<V> checked(Setting<V> origin, V value) {
+            try {
+                checkValue(origin, value);
+            } catch (Exception e) {
+                throw new IllegalArgumentException("invalid value for key '" + identifier + "': " + value, e);
+            }
+            return new Setting<>(this, value);
+        }
+
+        /**
+         * @return a new {@link Setting} object with this key and the supplied value.
+         *
+         * @throws IllegalArgumentException if the new value was invalid, as determined by:
+         * <pre>
+         *      {@code this.checkValue(null, value)}
+         * </pre>
+         *
+         * @see #checkValue(Setting, V)
+         */
+        public final Setting<V> withValue(V value) {
+            return checked(null, value);
+        }
+
+        /**
+         * @param o the object to check for equality
+         * @return {@code this == o}
+         */
+        public final boolean equals(Object o) {
+            return super.equals(o);
+        }
+
+        /**
+         * @return the identity-based hashcode of this key
+         */
+        public final int hashCode() {
+            return super.hashCode();
+        }
+
+        public String toString() {
+            return identifier + ": " + valueType.getTypeName();
+        }
+    }
+
+    private final Key<V> key;
+    private final V value;
+
+    private Setting(Key<V> key, V value) {
+        this.key = key;
+        this.value = value;
+    }
+
+    /**
+     * @return the identifier for this setting
+     */
+    public String getIdentifier() {
+        return key.identifier;
+    }
+
+    /**
+     * @return the value for this setting
+     */
+    public V getValue() {
+        return value;
+    }
+
+    /**
+     * @return the type of value supported for this setting
+     */
+    public Type getValueType() {
+        return key.valueType;
+    }
+
+    /**
+     * @return the supplied setting if it has the same key as this setting, otherwise {@link Optional#empty()}
+     */
+    @SuppressWarnings("unchecked")
+    public final Optional<Setting<V>> cast(Setting<?> setting) {
+        return setting == null || setting.key != this.key ? Optional.empty() : Optional.of((Setting<V>)setting);
+    }
+
+    /**
+     * @return a new {@link Setting} object with this setting's {@link Key Key} and the supplied value.
+     *
+     * @throws IllegalArgumentException if the new value was invalid, as determined by:
+     * <pre>
+     *     {@code this.key.checkValue(this, newValue)}
+     * </pre>
+     *
+     * @see Key#checkValue(Setting, V)
+     */
+    public Setting<V> withValue(V newValue) {
+        return key.checked(this, newValue);
+    }
+
+    /**
+     * @return true if the supplied object is an instance of {@link Setting} and has the same key and value as this object.
+     */
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (!(o instanceof Setting)) return false;
+
+        Setting<?> setting = (Setting<?>) o;
+
+        if (key != setting.key) return false;
+        return value != null ? value.equals(setting.value) : setting.value == null;
+    }
+
+    @Override
+    public int hashCode() {
+        return 31 * key.hashCode() + (value != null ? value.hashCode() : 0);
+    }
+
+    @Override
+    public String toString() {
+        return key.identifier + "=" + value;
+    }
+
+}
diff --git a/api/src/main/java/org/apache/any23/configuration/Settings.java b/api/src/main/java/org/apache/any23/configuration/Settings.java
new file mode 100644 (file)
index 0000000..1289be3
--- /dev/null
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.configuration;
+
+import java.util.AbstractSet;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+
+/**
+ * This class represents an <i>immutable</i> {@link Set} of {@link Setting} objects,
+ * with the additional property that no two settings having the same {@link Setting#getIdentifier() identifier}
+ * can be simultaneously present in a {@code Settings} object.
+ *
+ * @author Hans Brende (hansbrende@apache.org)
+ */
+public final class Settings extends AbstractSet<Setting<?>> {
+
+    private static final Settings EMPTY_SETTINGS = new Settings(Collections.emptyMap());
+
+    private final Map<String, Setting<?>> values;
+
+    private Settings(Map<String, Setting<?>> values) {
+        this.values = values;
+    }
+
+    /**
+     * Returns the setting with the same {@link Setting.Key Key} as the supplied setting, if present.
+     */
+    public <E> Optional<Setting<E>> find(Setting<E> setting) {
+        return setting.cast(values.get(setting.getIdentifier()));
+    }
+
+    /**
+     * Returns the value set for {@code defaultSetting}'s {@link Setting.Key Key}, if present.
+     * Otherwise, returns {@code defaultSetting}'s value.
+     * <br><br>
+     * This method is semantically equivalent to:
+     * <br><br>
+     * <pre>
+     * {@code find(defaultSetting).orElse(defaultSetting).getValue()}
+     * </pre>
+     */
+    public <E> E get(Setting<E> defaultSetting) {
+        return find(defaultSetting).orElse(defaultSetting).getValue();
+    }
+
+
+    ///////////////////////////////////////
+    // AbstractSet overrides
+    ///////////////////////////////////////
+
+    @Override
+    public boolean contains(Object o) {
+        if (!(o instanceof Setting<?>)) {
+            return false;
+        }
+        return o.equals(values.get(((Setting<?>) o).getIdentifier()));
+    }
+
+    @Override
+    public int size() {
+        return values.size();
+    }
+
+    @Override
+    public Iterator<Setting<?>> iterator() {
+        return values.values().iterator();
+    }
+
+    ///////////////////////////////////////
+    // public static factory methods
+    ///////////////////////////////////////
+
+    /**
+     * Returns an empty {@link Settings} object.
+     */
+    public static Settings of() {
+        return EMPTY_SETTINGS;
+    }
+
+    /**
+     * Returns a singleton {@link Settings} object, containing only the supplied setting.
+     */
+    public static Settings of(Setting<?> s) {
+        return new Settings(Collections.singletonMap(s.getIdentifier(), s));
+    }
+
+    /**
+     * Returns a {@link Settings} object containing the supplied settings.
+     * Settings with duplicate identifiers are rejected rather than overwritten.
+     * @throws IllegalArgumentException if any two settings have the same identifier
+     */
+    public static Settings of(Setting<?>... settings) {
+        Map<String, Setting<?>> map = mapForSize(settings.length);
+        for (Setting<?> s : settings) put(map, s);
+        return ofModifiable(map);
+    }
+
+    /**
+     * Returns a {@link Settings} object containing the supplied settings.
+     * @throws IllegalArgumentException if any two settings have the same identifier
+     */
+    public static Settings of(Collection<? extends Setting<?>> c) {
+        if (c instanceof Settings) {
+            return (Settings)c;
+        }
+        int size = c.size();
+        if (size == 0) {
+            return EMPTY_SETTINGS;
+        }
+        Map<String, Setting<?>> map = mapForSize(size);
+        for (Setting<?> s : c) put(map, s);
+        return ofModifiable(map);
+    }
+
+    ///////////////////////////////////////
+    // Private static helpers
+    ///////////////////////////////////////
+
+    private static Settings ofModifiable(Map<String, Setting<?>> map) {
+        return new Settings(Collections.unmodifiableMap(map));
+    }
+
+    private static void put(Map<String, Setting<?>> map, Setting<?> setting) {
+        Setting<?> existing = map.put(setting.getIdentifier(), setting);
+        if (existing != null) {
+            throw new IllegalArgumentException(setting.getIdentifier() + " is already defined");
+        }
+    }
+
+    private static final float loadFactor = 0.75f;
+    private static Map<String, Setting<?>> mapForSize(int size) {
+        return new HashMap<>((int)(size / loadFactor) + 1, loadFactor);
+    }
+
+}
diff --git a/api/src/main/java/org/apache/any23/writer/DecoratingWriterFactory.java b/api/src/main/java/org/apache/any23/writer/DecoratingWriterFactory.java
new file mode 100644 (file)
index 0000000..cc66372
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.writer;
+
+import org.apache.any23.configuration.Settings;
+
+/**
+ * Base interface used for constructors of decorating {@link TripleHandler} implementations.
+ * @author Hans Brende (hansbrende@apache.org)
+ */
+public interface DecoratingWriterFactory extends BaseWriterFactory<TripleHandler> {
+
+    /**
+     *
+     * @return the settings supported by handlers produced by this factory
+     */
+    @Override
+    Settings getSupportedSettings();
+
+    /**
+     * @param delegate the {@link TripleHandler} to delegate input to
+     * @param settings the settings with which to configure the returned handler
+     * @return a {@link TripleHandler} which writes to the specified delegate
+     * @throws NullPointerException if the delegate or settings is null
+     * @throws IllegalArgumentException if the settings are not correctly configured
+     */
+    @Override
+    TripleHandler getTripleWriter(TripleHandler delegate, Settings settings);
+
+}
diff --git a/api/src/main/java/org/apache/any23/writer/TripleFormat.java b/api/src/main/java/org/apache/any23/writer/TripleFormat.java
new file mode 100644 (file)
index 0000000..01292eb
--- /dev/null
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.writer;
+
+import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.ValueFactory;
+import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
+import org.eclipse.rdf4j.rio.RDFFormat;
+
+import java.nio.charset.Charset;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.Optional;
+import java.util.stream.Collectors;
+
+/**
+ * @author Hans Brende (hansbrende@apache.org)
+ */
+public class TripleFormat {
+    private final String name;
+    private final IRI standardIRI;
+    private final List<String> mimeTypes;
+    private final Charset charset;
+    private final List<String> fileExtensions;
+    private final Capabilities capabilities;
+    RDFFormat rdfFormat;
+
+    private static final ValueFactory vf = SimpleValueFactory.getInstance();
+
+    private static final int WRITES_TRIPLES = 1;
+    private static final int WRITES_GRAPHS = 1 << 1;
+    private static final int WRITES_NAMESPACES = 1 << 2;
+
+    public static final Capabilities NONSTANDARD = new Capabilities(0);
+    public static final Capabilities TRIPLES = new Capabilities(WRITES_TRIPLES);
+    public static final Capabilities QUADS = new Capabilities(WRITES_TRIPLES | WRITES_GRAPHS);
+    public static final Capabilities TRIPLES_AND_NAMESPACES = TRIPLES.withNamespaces();
+    public static final Capabilities QUADS_AND_NAMESPACES = QUADS.withNamespaces();
+
+    public static class Capabilities {
+        private final int raw;
+
+        private Capabilities(int raw) {
+            this.raw = raw;
+        }
+
+        public boolean has(Capabilities other) {
+            int oraw = other.raw;
+            return (raw & oraw) == oraw;
+        }
+
+        private Capabilities withNamespaces() {
+            return new Capabilities(raw | WRITES_NAMESPACES);
+        }
+
+        //TODO: add "supportsComments()"
+    }
+
+    private static IllegalArgumentException mimeTypeErr(String mt) {
+        return new IllegalArgumentException(mt + " is not a valid mimetype");
+    }
+
+    private static IllegalArgumentException extensionErr(String ext) {
+        return new IllegalArgumentException(ext + " is not a valid extension");
+    }
+
+    private static <E> E checkNonNull(E object, String name) {
+        if (object == null) {
+            throw new IllegalArgumentException(name + " must not be null");
+        }
+        return object;
+    }
+
+    //see https://tools.ietf.org/html/rfc2045#section-5.1
+    private static void checkMimeTypes(List<String> mts) {
+        if (checkNonNull(mts, "mimetypes").isEmpty()) {
+            throw new IllegalArgumentException("mimetypes must not be empty");
+        }
+        for (String mt : mts) {
+            boolean slash = false;
+            for (int i = 0, len = checkNonNull(mt, "mimetype").length(); i < len; i++) {
+                char ch = mt.charAt(i);
+                if (ch <= ' ' || ch >= 127 || ch == '(' || ch == ')' ||
+                        ch == '<' || ch == '>' || ch == '@' || ch == ',' ||
+                        ch == ';' || ch == ':' || ch == '\\' || ch == '"' ||
+                        ch == '[' || ch == ']' || ch == '?' || ch == '='
+                        //also disallow wildcards:
+                        || ch == '*') {
+                    throw mimeTypeErr(mt);
+                } else if (ch == '/') {
+                    if (slash || i == 0 || i + 1 == len) {
+                        throw mimeTypeErr(mt);
+                    }
+                    slash = true;
+                }
+            }
+            if (!slash) {
+                throw mimeTypeErr(mt);
+            }
+        }
+    }
+
+    private static void checkExtensions(List<String> exts) {
+        for (String ext : checkNonNull(exts, "extensions")) {
+            int illegalDot = 0;
+            for (int i = 0, len = checkNonNull(ext, "extension").length(); i < len; i++) {
+                char ch = ext.charAt(i);
+                if (ch <= ' ' || ch >= 127 || ch == '<' || ch == '>' ||
+                        ch == ':' || ch == '"' || ch == '/' || ch == '\\' ||
+                        ch == '|' || ch == '?' || ch == '*') {
+                    throw extensionErr(ext);
+                } else if (ch == '.') {
+                    int next = i + 1;
+                    if (i == illegalDot || next == len) {
+                        throw extensionErr(ext);
+                    }
+                    illegalDot = next;
+                }
+            }
+        }
+    }
+
+    private static String normalizeMimeType(String mt) {
+        return mt.toLowerCase(Locale.ENGLISH);
+    }
+
+    private static String normalizeExtension(String ext) {
+        return ext.toLowerCase(Locale.ENGLISH);
+    }
+
+    private TripleFormat(String name, Collection<String> mimeTypes, Charset charset,
+                     Collection<String> fileExtensions, String standardIRI, Capabilities capabilities) {
+        this.name = checkNonNull(name, "display name");
+        checkMimeTypes(this.mimeTypes = Collections.unmodifiableList(mimeTypes.stream()
+                .map(TripleFormat::normalizeMimeType).distinct().collect(Collectors.toList())));
+        if ((this.charset = charset) != null && !charset.canEncode()) {
+            throw new IllegalArgumentException(charset + " does not allow encoding");
+        }
+        checkExtensions(this.fileExtensions = Collections.unmodifiableList(fileExtensions.stream()
+                .map(TripleFormat::normalizeExtension).distinct().collect(Collectors.toList())));
+        this.standardIRI = standardIRI == null ? null : vf.createIRI(standardIRI);
+        this.capabilities = checkNonNull(capabilities, "capabilities");
+    }
+
+    public static TripleFormat of(String displayName, Collection<String> mimeTypes, Charset defaultCharset,
+                                  Collection<String> fileExtensions, String standardIRI, Capabilities capabilities) {
+        return new TripleFormat(displayName, mimeTypes, defaultCharset, fileExtensions, standardIRI, capabilities);
+    }
+
+    public Optional<Charset> getCharset() {
+        return Optional.ofNullable(charset);
+    }
+
+    static Capabilities capabilities(RDFFormat format) {
+        if (format.supportsContexts()) {
+            return format.supportsNamespaces() ? QUADS_AND_NAMESPACES : QUADS;
+        } else {
+            return format.supportsNamespaces() ? TRIPLES_AND_NAMESPACES : TRIPLES;
+        }
+    }
+
+    private static String iri(IRI iri) {
+        return iri == null ? null : iri.stringValue();
+    }
+
+    static TripleFormat of(RDFFormat format) {
+        TripleFormat f = of(format.getName(), format.getMIMETypes(),
+                format.getCharset(), format.getFileExtensions(), iri(format.getStandardURI()),
+                capabilities(format));
+        f.rdfFormat = format;
+        return f;
+    }
+
+    RDFFormat toRDFFormat() {
+        RDFFormat fmt = rdfFormat;
+        if (fmt != null) {
+            return fmt;
+        }
+        Capabilities capabilities = this.capabilities;
+        if (!capabilities.has(TRIPLES)) {
+            throw new UnsupportedOperationException("This format does not print RDF triples");
+        }
+        return rdfFormat = new RDFFormat(name, mimeTypes, charset, fileExtensions, standardIRI,
+                capabilities.has(TRIPLES_AND_NAMESPACES), capabilities.has(QUADS));
+    }
+
+    public Optional<IRI> getStandardIRI() {
+        return Optional.ofNullable(standardIRI);
+    }
+
+    public List<String> getMimeTypes() {
+        return mimeTypes;
+    }
+
+    public String getMimeType() {
+        return mimeTypes.get(0);
+    }
+
+    public List<String> getExtensions() {
+        return fileExtensions;
+    }
+
+    public Optional<String> getExtension() {
+        return fileExtensions.isEmpty() ? Optional.empty() : Optional.of(fileExtensions.get(0));
+    }
+
+    public Capabilities getCapabilities() {
+        return capabilities;
+    }
+
+    public String getDisplayName() {
+        return name;
+    }
+
+    public String toString() {
+        return name + mimeTypes.stream().collect(
+                Collectors.joining(", ", " (mimeTypes=", "; "))
+                + fileExtensions.stream().collect(
+                        Collectors.joining(", ", "ext=", ")"));
+    }
+
+}
diff --git a/api/src/main/java/org/apache/any23/writer/TripleWriter.java b/api/src/main/java/org/apache/any23/writer/TripleWriter.java
new file mode 100644 (file)
index 0000000..3800045
--- /dev/null
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.writer;
+
+import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.Resource;
+import org.eclipse.rdf4j.model.Value;
+
+/**
+ * Base interface for triple writers that don't need an extraction context to write triples.
+ * <p>
+ * A writer is {@link AutoCloseable}; its {@link #close()} method is narrowed to throw only
+ * {@link TripleHandlerException}.
+ *
+ * @author Hans Brende (hansbrende@apache.org)
+ */
+public interface TripleWriter extends AutoCloseable {
+
+    /**
+     * Writes a triple and, optionally, a graph resource name.
+     * @param s the subject to write
+     * @param p the predicate to write
+     * @param o the object to write
+     * @param g the graph name to write, or null
+     * @throws TripleHandlerException if there is an error writing the triple
+     */
+    void writeTriple(Resource s, IRI p, Value o, Resource g) throws TripleHandlerException;
+
+    /**
+     * Writes a prefix-namespace mapping. <br><b>NOTE:</b> this method should be called
+     * <b>before</b> writing out any triples. Calling this method <b>after</b> writing
+     * out a triple may result in the prefix-namespace mapping being ignored.
+     * @param prefix the namespace prefix
+     * @param uri the namespace uri
+     * @throws TripleHandlerException if there was an error writing out the prefix-namespace mapping
+     */
+    void writeNamespace(String prefix, String uri) throws TripleHandlerException;
+
+    /**
+     * Releases resources associated with this {@link TripleWriter}, and flushes (but by default does not close)
+     * any underlying {@link java.io.OutputStream}s. Invoking any method of this writer
+     * after this method has been called produces <b>undefined behavior</b>.
+     * @throws TripleHandlerException if there was an error closing this {@link TripleWriter}
+     */
+    @Override
+    void close() throws TripleHandlerException;
+
+}
diff --git a/api/src/main/java/org/apache/any23/writer/TripleWriterFactory.java b/api/src/main/java/org/apache/any23/writer/TripleWriterFactory.java
new file mode 100644 (file)
index 0000000..20d4995
--- /dev/null
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.writer;
+
+import org.apache.any23.configuration.Settings;
+import org.apache.any23.extractor.ExtractionContext;
+import org.eclipse.rdf4j.common.lang.FileFormat;
+import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.Resource;
+import org.eclipse.rdf4j.model.Value;
+import org.eclipse.rdf4j.rio.RDFFormat;
+
+import java.io.OutputStream;
+
+/**
+ * Base interface for constructors of {@link TripleHandler} implementations
+ * that write to an {@link OutputStream} using a particular {@link FileFormat}.
+ * <p>
+ * The deprecated {@link WriterFactory} methods are given default implementations here
+ * that are derived from {@link #getTripleFormat()} and {@link #getTripleWriter(OutputStream, Settings)}.
+ * @author Hans Brende (hansbrende@apache.org)
+ */
+public interface TripleWriterFactory extends BaseWriterFactory<OutputStream> {
+
+    /**
+     * Returns the RDF4J view of {@link #getTripleFormat()}.
+     * @return the result of converting {@link #getTripleFormat()} to an {@link RDFFormat}
+     * @throws UnsupportedOperationException if the triple format cannot be represented as an {@link RDFFormat}
+     * @deprecated since 2.3. Use {@link #getTripleFormat()} instead.
+     */
+    @Override
+    @Deprecated
+    default RDFFormat getRdfFormat() {
+        return getTripleFormat().toRDFFormat();
+    }
+
+    /**
+     * @return the format used to write to {@link OutputStream}s
+     */
+    TripleFormat getTripleFormat();
+
+    /**
+     * Returns the MIME type reported by {@link #getTripleFormat()}.
+     * @return the MIME type of the format written by this factory's writers
+     * @deprecated since 2.3. Use {@link #getTripleFormat()}.{@link TripleFormat#getMimeType() getMimeType()} instead.
+     */
+    @Override
+    @Deprecated
+    default String getMimeType() {
+        return getTripleFormat().getMimeType();
+    }
+
+    /**
+     * Creates a writer via {@link #getTripleWriter(OutputStream, Settings)}, passing {@code Settings.of()},
+     * and exposes it as a {@link FormatWriter}.
+     * If the created handler already is a {@link FormatWriter}, it is returned as-is. Otherwise it is wrapped
+     * in an adapter that forwards every {@link TripleHandler} call to it, always reports
+     * {@code isAnnotated() == false}, and ignores {@code setAnnotated}.
+     * @param os the {@link OutputStream} to write to
+     * @return a {@link FormatWriter} writing to the specified {@link OutputStream}
+     * @deprecated since 2.3. Use {@link #getTripleWriter(OutputStream, Settings)} instead.
+     */
+    @Override
+    @Deprecated
+    default FormatWriter getRdfWriter(OutputStream os) {
+        TripleHandler th = getTripleWriter(os, Settings.of());
+        //only wrap when necessary; the wrapper adds nothing but the FormatWriter annotation accessors
+        return th instanceof FormatWriter ? (FormatWriter)th : new FormatWriter() {
+            @Override
+            public boolean isAnnotated() {
+                return false;
+            }
+            //intentionally a no-op: the wrapped handler has no notion of annotation
+            @Override
+            public void setAnnotated(boolean f) {}
+            @Override
+            public void startDocument(IRI documentIRI) throws TripleHandlerException {
+                th.startDocument(documentIRI);
+            }
+            @Override
+            public void openContext(ExtractionContext context) throws TripleHandlerException {
+                th.openContext(context);
+            }
+            @Override
+            public void receiveTriple(Resource s, IRI p, Value o, IRI g, ExtractionContext context) throws TripleHandlerException {
+                th.receiveTriple(s, p, o, g, context);
+            }
+            @Override
+            public void receiveNamespace(String prefix, String uri, ExtractionContext context) throws TripleHandlerException {
+                th.receiveNamespace(prefix, uri, context);
+            }
+            @Override
+            public void closeContext(ExtractionContext context) throws TripleHandlerException {
+                th.closeContext(context);
+            }
+            @Override
+            public void endDocument(IRI documentIRI) throws TripleHandlerException {
+                th.endDocument(documentIRI);
+            }
+            @Override
+            public void setContentLength(long contentLength) {
+                th.setContentLength(contentLength);
+            }
+            @Override
+            public void close() throws TripleHandlerException {
+                th.close();
+            }
+        };
+    }
+
+
+    /**
+     *
+     * @return the settings supported by writers produced by this factory
+     */
+    @Override
+    Settings getSupportedSettings();
+
+
+    /**
+     * @param out the {@link OutputStream} to write to
+     * @param settings the settings with which to configure the writer
+     * @return a {@link TripleHandler} which writes to the specified {@link OutputStream}
+     * @throws NullPointerException if the output stream or settings is null
+     * @throws IllegalArgumentException if the settings are not correctly configured
+     */
+    @Override
+    TripleHandler getTripleWriter(OutputStream out, Settings settings);
+
+}
index 3012beb..060177b 100644 (file)
@@ -19,18 +19,59 @@ package org.apache.any23.writer;
 
 import java.io.OutputStream;
 
+import org.apache.any23.configuration.Settings;
 import org.eclipse.rdf4j.rio.RDFFormat;
 
 /**
- * @author Peter Ansell p_ansell@yahoo.com
- * 
+ * The superinterface of all {@link TripleHandler} factory interfaces.
+ * Do not implement this interface directly. Instead, implement one of the subinterfaces {@link TripleWriterFactory} or {@link DecoratingWriterFactory}.
+ * @author Peter Ansell (p_ansell@yahoo.com)
+ * @author Hans Brende (hansbrende@apache.org)
  */
 public interface WriterFactory {
+
+    /**
+     * @return the RDF4J format written by this factory's writers
+     * @deprecated since 2.3. Use {@link TripleWriterFactory#getTripleFormat()} instead.
+     */
+    @Deprecated
     RDFFormat getRdfFormat();
 
+    /**
+     * @return the identifier under which this factory is registered in a {@link WriterFactoryRegistry}
+     */
     String getIdentifier();
 
+    /**
+     * @return the MIME type of the format written by this factory's writers
+     * @deprecated since 2.3. Use {@link TripleWriterFactory#getTripleFormat()}.{@link TripleFormat#getMimeType() getMimeType()} instead.
+     */
+    @Deprecated
     String getMimeType();
 
+    /**
+     * @param os the output stream to write to
+     * @return a writer which writes to the specified output stream
+     * @deprecated since 2.3. Use {@link TripleWriterFactory#getTripleWriter(OutputStream, Settings)} instead.
+     */
+    @Deprecated
     FormatWriter getRdfWriter(OutputStream os);
 }
+
+/**
+ * Package-private common base of the supported {@link WriterFactory} subinterfaces.
+ * <p>
+ * It adds the settings-aware factory method and supplies default implementations of the deprecated
+ * {@link WriterFactory} methods that throw {@link UnsupportedOperationException}, so that
+ * subinterfaces only need to override those they can actually support.
+ *
+ * @param <Output> the type of the target that created handlers write to
+ */
+interface BaseWriterFactory<Output> extends WriterFactory {
+
+    /**
+     * @return the settings supported by handlers produced by this factory
+     */
+    Settings getSupportedSettings();
+
+    /**
+     * @param output the target to write to
+     * @param settings the settings with which to configure the handler
+     * @return a {@link TripleHandler} which writes to the specified target
+     */
+    TripleHandler getTripleWriter(Output output, Settings settings);
+
+    /**
+     * Unsupported by default.
+     * @throws UnsupportedOperationException always, unless overridden
+     */
+    @Override
+    @Deprecated
+    default FormatWriter getRdfWriter(OutputStream os) {
+        throw new UnsupportedOperationException("this class does not support getRdfWriter()");
+    }
+
+    /**
+     * Unsupported by default.
+     * @throws UnsupportedOperationException always, unless overridden
+     */
+    @Override
+    @Deprecated
+    default String getMimeType() {
+        throw new UnsupportedOperationException("this class does not support getMimeType()");
+    }
+
+    /**
+     * Unsupported by default.
+     * @throws UnsupportedOperationException always, unless overridden
+     */
+    @Override
+    @Deprecated
+    default RDFFormat getRdfFormat() {
+        throw new UnsupportedOperationException("this class does not support getRdfFormat()");
+    }
+}
\ No newline at end of file
index cbe5f9a..64830d8 100644 (file)
@@ -19,15 +19,21 @@ package org.apache.any23.writer;
 
 import java.io.OutputStream;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 import java.util.ServiceConfigurationError;
 import java.util.ServiceLoader;
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.concurrent.CopyOnWriteArraySet;
 
+import org.apache.any23.configuration.Settings;
+import org.eclipse.rdf4j.rio.RDFFormat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -35,6 +41,7 @@ import org.slf4j.LoggerFactory;
  * Registry class for {@link WriterFactory}s.
  *
  * @author Michele Mostarda (mostarda@fbk.eu)
+ * @author Hans Brende (hansbrende@apache.org)
  */
 public class WriterFactoryRegistry {
 
@@ -43,54 +50,54 @@ public class WriterFactoryRegistry {
     /**
      * Singleton instance.
      */
-    private static WriterFactoryRegistry instance;
+    //the singleton is created when this holder class is initialized,
+    //which happens the first time getInstance() reads the field below
+    private static class InstanceHolder {
+        private static final WriterFactoryRegistry instance = new WriterFactoryRegistry();
+    }
+
+    private static final WriterFactory[] EMPTY_WRITERS = new WriterFactory[0];
 
     /**
      * List of registered writers.
      */
-    private final List<WriterFactory> writers =
-            new ArrayList<>();
+    private final List<WriterFactory> writers = new CopyOnWriteArrayList<>();
 
     /**
-     * MIME Type to {@link FormatWriter} class.
+     * MIME Type to {@link WriterFactory} class.
      */
-    private final Map<String,List<WriterFactory>> mimeToWriter =
-            new HashMap<>();
+    private final Map<String, List<WriterFactory>> mimeToWriter = Collections.synchronizedMap(new HashMap<>());
 
     /**
-     * Identifier to {@link FormatWriter} class.
+     * Identifier to {@link WriterFactory} class.
      */
-    private final Map<String,WriterFactory> idToWriter =
-            new HashMap<>();
+    private final Map<String, WriterFactory> idToWriter = new HashMap<>();
 
-    private List<String> identifiers = new ArrayList<>();
+    private final List<String> identifiers = new CopyOnWriteArrayList<>();
+
+    private final Collection<String> mimeTypes = new CopyOnWriteArraySet<>();
 
     public WriterFactoryRegistry() {
-      ServiceLoader<WriterFactory> serviceLoader = java.util.ServiceLoader.load(WriterFactory.class, this.getClass().getClassLoader());
-      
-      Iterator<WriterFactory> iterator = serviceLoader.iterator();
+        ServiceLoader<WriterFactory> serviceLoader = java.util.ServiceLoader.load(WriterFactory.class, this.getClass().getClassLoader());
+
+        Iterator<WriterFactory> iterator = serviceLoader.iterator();
       
       // use while(true) loop so that we can isolate all service loader errors from .next and .hasNext to a single service
-      while(true)
-      {
-          try
-          {
-              if(!iterator.hasNext())
-                  break;
-              
-              WriterFactory factory = iterator.next();
-              
-              this.register(factory);
-          }
-          catch(ServiceConfigurationError error)
-          {
-              LOG.error("Found error loading a WriterFactory", error);
-          }
-      }
+
+        //collect first, then register everything that loaded successfully in a single batch
+        ArrayList<WriterFactory> factories = new ArrayList<>();
+        while (true) {
+            try {
+                if (!iterator.hasNext())
+                    break;
+                factories.add(iterator.next());
+            } catch(ServiceConfigurationError error) {
+                //a broken service entry is logged and skipped; loading continues with the next one
+                LOG.error("Found error loading a WriterFactory", error);
+            }
+        }
+
+        registerAll(factories.toArray(EMPTY_WRITERS));
     }
     
     /**
-     * Reads the identifier specified for the given {@link FormatWriter}.
+     * Reads the identifier specified for the given {@link WriterFactory}.
      *
      * @param writerClass writer class.
      * @return identifier.
@@ -100,97 +107,182 @@ public class WriterFactoryRegistry {
     }
 
     /**
-     * Reads the <i>MIME Type</i> specified for the given {@link FormatWriter}.
+     * Reads the <i>MIME Type</i> specified for the given {@link WriterFactory}.
      *
      * @param writerClass writer class.
      * @return MIME type.
      */
     public static String getMimeType(WriterFactory writerClass) {
-        return writerClass.getMimeType();
+        if (writerClass instanceof TripleWriterFactory) {
+            return ((TripleWriterFactory)writerClass).getTripleFormat().getMimeType();
+        } else if (writerClass instanceof DecoratingWriterFactory) {
+            //no MIME type is reported for decorating factories
+            return null;
+        } else {
+            //legacy factory: logs a warning and derives the format from the deprecated methods
+            return reportAndGetCompatFormat(writerClass).getMimeType();
+        }
     }
 
     /**
      * @return the {@link WriterFactoryRegistry} singleton instance.
      */
-    public static synchronized WriterFactoryRegistry getInstance() {
-        if(instance == null) {
-            instance = new WriterFactoryRegistry();
+    public static WriterFactoryRegistry getInstance() {
+        //no synchronization needed: the JVM initializes InstanceHolder (and thus the instance) only once
+        return InstanceHolder.instance;
+    }
+
+    /**
+     * Derives a {@link TripleFormat} for a legacy {@link WriterFactory} that implements neither
+     * {@link TripleWriterFactory} nor {@link DecoratingWriterFactory}, and logs a warning about it.
+     * <p>
+     * The format is built from the factory's deprecated {@code getMimeType()} and {@code getRdfFormat()}:
+     * <ul>
+     * <li>if {@code getRdfFormat()} throws a {@link RuntimeException}, a nonstandard format
+     * named after the MIME type is returned;</li>
+     * <li>if the MIME type is null or is a default MIME type of the RDF format,
+     * the RDF format is converted as-is;</li>
+     * <li>otherwise the RDF format's metadata is kept, but the factory's MIME type
+     * replaces the format's own MIME types.</li>
+     * </ul>
+     *
+     * @param f the legacy factory to inspect
+     * @return the format describing the output of <code>f</code>
+     */
+    @SuppressWarnings("deprecation")
+    private static TripleFormat reportAndGetCompatFormat(WriterFactory f) {
+        LOG.warn("{} must implement either {} or {}.", f.getClass(), TripleWriterFactory.class, DecoratingWriterFactory.class);
+        final String mimeType = f.getMimeType();
+        RDFFormat fmt;
+        try {
+            fmt = f.getRdfFormat();
+        } catch (RuntimeException e) {
+            return TripleFormat.of(mimeType, Collections.singleton(mimeType), null,
+                    Collections.emptySet(), null, TripleFormat.NONSTANDARD);
         }
-        return instance;
+        if (mimeType == null || fmt.hasDefaultMIMEType(mimeType)) {
+            return TripleFormat.of(fmt);
+        }
+        //an RDFFormat is not required to have a standard URI, so it must not be dereferenced unchecked
+        final org.eclipse.rdf4j.model.IRI standardIri = fmt.getStandardURI();
+        //override default MIME type on mismatch
+        return TripleFormat.of(fmt.getName(), Collections.singleton(mimeType), fmt.getCharset(),
+                fmt.getFileExtensions(), standardIri == null ? null : standardIri.stringValue(),
+                TripleFormat.capabilities(fmt));
+    }
+
+    /**
+     * Wraps a legacy {@link WriterFactory} in a {@link TripleWriterFactory} view so that it
+     * can be registered like any other factory.
+     * <p>
+     * The view reports the format computed once by {@link #reportAndGetCompatFormat(WriterFactory)},
+     * returns {@code Settings.of()} as its supported settings, and keeps the identifier of the wrapped factory.
+     *
+     * @param f the legacy factory to wrap
+     * @return a {@link TripleWriterFactory} delegating to <code>f</code>
+     */
+    private static TripleWriterFactory getCompatFactory(WriterFactory f) {
+        final TripleFormat format = reportAndGetCompatFormat(f);
+        return new TripleWriterFactory() {
+            @Override
+            public TripleFormat getTripleFormat() {
+                return format;
+            }
+
+            //the settings argument is ignored: the legacy API has no way to receive it
+            @Override
+            @SuppressWarnings("deprecation")
+            public TripleHandler getTripleWriter(OutputStream os, Settings settings) {
+                return f.getRdfWriter(os);
+            }
+
+            @Override
+            public Settings getSupportedSettings() {
+                return Settings.of();
+            }
+
+            @Override
+            public String getIdentifier() {
+                return f.getIdentifier();
+            }
+        };
     }
 
     /**
      * Registers a new {@link WriterFactory} to the registry.
      *
-     * @param writerClass the class of the writer to be registered.
+     * @param f the writer factory to be registered.
      * @throws IllegalArgumentException if the id or the mimetype are null
      *                                  or empty strings or if the identifier has been already defined.
      */
-    public synchronized void register(WriterFactory writerClass) {
-        if(writerClass == null)
+    public void register(WriterFactory f) {
+        if (f == null)
             throw new NullPointerException("writerClass cannot be null.");
-        final String id       = writerClass.getIdentifier();
-        final String mimeType = writerClass.getMimeType();
-        if(id == null || id.trim().length() == 0) {
-            throw new IllegalArgumentException("Invalid identifier returned by writer " + writerClass);
+        //single-element batch: identifier and MIME type validation happen in registerAll
+        registerAll(new WriterFactory[]{f});
+    }
+
+    /**
+     * Registers a batch of factories in three steps:
+     * <ol>
+     * <li>validates every factory (wrapping legacy ones in a compatibility view) without touching
+     * the registry, so that a validation failure leaves the registry unchanged;</li>
+     * <li>reserves all identifiers while holding the lock on the identifier map, rolling back the
+     * identifiers reserved so far by this call if one of them is already taken;</li>
+     * <li>publishes the factories, identifiers and MIME types to the collections read by the getters.</li>
+     * </ol>
+     *
+     * @param factories the factories to register; legacy entries of this array are replaced
+     *                  in place by their compatibility views
+     * @throws IllegalArgumentException if an identifier or MIME type is null or blank,
+     *                                  or if an identifier has already been declared
+     */
+    private void registerAll(WriterFactory[] factories) {
+        final int count = factories.length;
+        if (count == 0) {
+            return;
         }
-        if(mimeType == null || mimeType.trim().length() == 0) {
-            throw new IllegalArgumentException("Invalid MIME type returned by writer " + writerClass);
+        final HashMap<String, ArrayList<WriterFactory>> mimes = new HashMap<>();
+        final String[] ids = new String[count];
+
+        for (int i = 0; i < count; i++) {
+            WriterFactory f = factories[i];
+            if (!(f instanceof BaseWriterFactory<?>)) {
+                //backwards compatibility: view vanilla WriterFactory as TripleWriterFactory
+                f = factories[i] = getCompatFactory(f);
+            }
+            final String id = ids[i] = f.getIdentifier();
+            if (id == null || id.trim().isEmpty()) {
+                throw new IllegalArgumentException("Invalid identifier returned by writer " + f);
+            }
+            //only TripleWriterFactory instances are indexed by MIME type
+            if (f instanceof TripleWriterFactory) {
+                String mimeType = ((TripleWriterFactory)f).getTripleFormat().getMimeType();
+                if (mimeType == null || mimeType.trim().isEmpty()) {
+                    throw new IllegalArgumentException("Invalid MIME type returned by writer " + f);
+                }
+                mimes.computeIfAbsent(mimeType, k -> new ArrayList<>()).add(f);
+            }
+        }
+
+        final List<String> idList = Arrays.asList(ids);
+        final List<WriterFactory> factoryList = Arrays.asList(factories);
+        final Map<String, WriterFactory> idToWriter;
+        synchronized (idToWriter = this.idToWriter) {
+            for (int i = 0; i < count; i++) {
+                String id = ids[i];
+                if (idToWriter.putIfAbsent(id, factories[i]) != null) {
+                    //roll back the identifiers this call has reserved so far (indices 0..i-1)
+                    idToWriter.keySet().removeAll(idList.subList(0, i));
+                    throw new IllegalArgumentException("The writer identifier is already declared: " + id);
+                }
+            }
         }
-        if(idToWriter.containsKey(id))
-            throw new IllegalArgumentException("The writer identifier is already declared.");
-
-        writers.add(writerClass);
-        identifiers.add(writerClass.getIdentifier());
-        List<WriterFactory> writerClasses = mimeToWriter.get(mimeType);
-        if(writerClasses == null) {
-            writerClasses = new ArrayList<>();
-            mimeToWriter.put(mimeType, writerClasses);
+        //add in bulk to reduce writes to CopyOnWriteArrayList
+        writers.addAll(factoryList);
+        identifiers.addAll(idList);
+        for (Map.Entry<String, ArrayList<WriterFactory>> entry : mimes.entrySet()) {
+            String mimeType = entry.getKey();
+            mimeTypes.add(mimeType);
+            mimeToWriter.computeIfAbsent(mimeType, k -> new CopyOnWriteArrayList<>()).addAll(entry.getValue());
         }
-        writerClasses.add(writerClass);
-        idToWriter.put(id, writerClass);
     }
 
     /**
-     * Verifies if a {@link FormatWriter} with given <code>id</code> identifier has been registered.
+     * Verifies if a {@link WriterFactory} with given <code>id</code> identifier has been registered.
      *
      * @param id identifier.
      * @return <code>true</code> if the identifier has been registered, <code>false</code> otherwise.
      */
-    public synchronized boolean hasIdentifier(String id) {
-        return idToWriter.containsKey(id);
+    public boolean hasIdentifier(String id) {
+        //idToWriter is a plain HashMap, so reads lock on the map itself, as registration does
+        synchronized (idToWriter) {
+            return idToWriter.containsKey(id);
+        }
     }
 
     /**
      * @return the list of all the specified identifiers.
      */
-    public synchronized List<String> getIdentifiers() {
+    public List<String> getIdentifiers() {
+        //no synchronized block needed for CopyOnWriteArrayList
+        //the result is a read-only view, so it also reflects identifiers registered later
         return Collections.unmodifiableList(identifiers);
     }
 
     /**
-     * @return the list of MIME types covered by the registered {@link FormatWriter}s.
+     * @return the list of MIME types covered by the registered {@link WriterFactory} instances.
      */
-    public synchronized Collection<String> getMimeTypes() {
-        return Collections.unmodifiableCollection(mimeToWriter.keySet());
+    public Collection<String> getMimeTypes() {
+        //no synchronized block needed for CopyOnWriteArraySet
+        //the result is a read-only view, so it also reflects MIME types registered later
+        return Collections.unmodifiableCollection(mimeTypes);
     }
 
     /**
-     * @return the list of all the registered {@link FormatWriter}s.
+     * @return the list of all the registered {@link WriterFactory} instances.
      */
-    public synchronized List<WriterFactory> getWriters() {
+    public List<WriterFactory> getWriters() {
+        //no synchronized block needed for CopyOnWriteArrayList
+        //the result is a read-only view, so it also reflects factories registered later
         return Collections.unmodifiableList(writers);
     }
 
     /**
-     * Returns the {@link FormatWriter} identified by <code>id</code>.
+     * Returns the {@link WriterFactory} identified by <code>id</code>.
      *
      * @param id the writer identifier.
-     * @return the class of the {@link FormatWriter} matching the <code>id</code>
-     *         or <code>null</code> if not found.s
+     * @return the {@link WriterFactory} matching the <code>id</code>
+     *         or <code>null</code> if not found.
      */
-    public synchronized WriterFactory getWriterByIdentifier(String id) {
-        return idToWriter.get(id);
+    public WriterFactory getWriterByIdentifier(String id) {
+        //idToWriter is a plain HashMap, so reads lock on the map itself, as registration does
+        synchronized (idToWriter) {
+            return idToWriter.get(id);
+        }
     }
 
     /**
@@ -199,42 +291,29 @@ public class WriterFactoryRegistry {
      * @param mimeType a MIMEType.
      * @return a list of matching writers or an empty list.
      */
-    public synchronized Collection<WriterFactory> getWritersByMimeType(String mimeType) {
-        return mimeToWriter.get(mimeType);
+    public Collection<WriterFactory> getWritersByMimeType(String mimeType) {
+        //no synchronized block needed for synchronized map
+        //the stored lists are CopyOnWriteArrayLists, so iterating over the returned read-only view
+        //cannot throw a ConcurrentModificationException while other factories are being registered
+        List<WriterFactory> list = mimeToWriter.get(mimeType);
+        return list != null ? Collections.unmodifiableList(list) : Collections.emptyList();
     }
 
     /**
-     * Returns an instance of {@link FormatWriter} ready to write on the given <code>os</code>
+     * Returns an instance of {@link FormatWriter} ready to write on the given
      * {@link OutputStream}.
      *
-     * @param id the identifier of the {@link FormatWriter} to crate an instance.
+     * @param id the identifier of the {@link FormatWriter} to instantiate.
      * @param os the output stream.
      * @return the not <code>null</code> {@link FormatWriter} instance.
      * @throws NullPointerException if the <code>id</code> doesn't match any registered writer.
-     */
-    public synchronized FormatWriter getWriterInstanceByIdentifier(String id, OutputStream os) {
-        final  WriterFactory writerClazz = getWriterByIdentifier(id);
-        if(writerClazz == null)
-            throw new NullPointerException(
-                String.format("Cannot find writer with id '%s' .", id)
-            );
-        return createWriter(writerClazz, os);
-    }
-
-    /**
-     * Crates a writer instance.
+     * @throws UnsupportedOperationException if the matching factory does not support the deprecated
+     *         {@link WriterFactory#getRdfWriter(OutputStream)} method.
      *
-     * @param clazz class to instantiate.
-     * @param os output stream to pass as constructor argument.
-     * @return created instance.
-     * @throws IllegalArgumentException if an error occurs during instantiation.
+     * @deprecated since 2.3. Use {@link #getWriterByIdentifier(String)}
+     * in combination with {@link TripleWriterFactory#getTripleWriter(OutputStream, Settings)} instead.
      */
-    private FormatWriter createWriter(WriterFactory clazz, OutputStream os) {
-        try {
-            return clazz.getRdfWriter(os);
-        } catch (Exception e) {
-            throw new IllegalArgumentException("Error while initializing format writer " + clazz + " .", e);
-        }
+    @Deprecated
+    public FormatWriter getWriterInstanceByIdentifier(String id, OutputStream os) {
+        //exceptions thrown by the factory's getRdfWriter propagate to the caller unwrapped
+        return Objects.requireNonNull(getWriterByIdentifier(id),
+                "Cannot find writer with id " + id).getRdfWriter(os);
     }
 
 }
diff --git a/api/src/test/java/org/apache/any23/configuration/SettingsTest.java b/api/src/test/java/org/apache/any23/configuration/SettingsTest.java
new file mode 100644 (file)
index 0000000..a5a7b6e
--- /dev/null
@@ -0,0 +1,227 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.configuration;
+
+import org.junit.Test;
+
+import java.lang.reflect.ParameterizedType;
+import java.lang.reflect.Type;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+/**
+ * Unit tests for {@link Setting} and {@link Settings}: null handling, identifier uniqueness,
+ * lookup by key, value type resolution, and rejection of invalid keys.
+ */
+@SuppressWarnings("ResultOfMethodCallIgnored")
+public class SettingsTest {
+
+    /**
+     * A setting created with a non-null value must reject a null replacement value.
+     */
+    @Test
+    public void testNonNullSetting() {
+        Setting<String> nonNull = Setting.newKey("nulltest", String.class).withValue("A nonnull string");
+        try {
+            nonNull.withValue(null);
+            fail();
+        } catch (IllegalArgumentException e) {
+            //test passes; ignore
+        }
+    }
+
+    /**
+     * A setting created with a null value must accept null as a replacement value.
+     */
+    @Test
+    public void testNullableSetting() {
+        Setting<String> nullable = Setting.newKey("nulltest", String.class).withValue(null);
+        assertNull(nullable.withValue(null).getValue());
+    }
+
+    /**
+     * {@link Settings#of} must reject two settings that share the same identifier.
+     */
+    @Test
+    public void testDuplicateIdentifiers() {
+        try {
+            Setting<String> first = Setting.newKey("foo", String.class).withValue("");
+            Setting<String> second = Setting.newKey("foo", String.class).withValue("");
+
+            Settings.of(first, second);
+
+            fail();
+        } catch (IllegalArgumentException e) {
+            //test passes; ignore
+        }
+    }
+
+    /**
+     * {@code find} must return the stored setting derived from the same key, while
+     * {@code contains} must only match the stored setting itself, not the setting used as the lookup key.
+     */
+    @Test
+    public void testFind() {
+        Setting<String> key = Setting.newKey("foo", String.class).withValue("key");
+        Setting<String> element = key.withValue("element");
+
+        Settings settings = Settings.of(element);
+
+        Optional<Setting<String>> actual = settings.find(key);
+
+        assertTrue(actual.isPresent());
+
+        assertSame(element, actual.get());
+
+        assertTrue(settings.contains(element));
+        assertFalse(settings.contains(key));
+    }
+
+    /**
+     * {@code get} must return the stored value when a setting derived from the same key is present.
+     */
+    @Test
+    public void testGetPresentSetting() {
+        Setting<String> key = Setting.newKey("foo", String.class).withValue("key");
+
+        Setting<String> actual = key.withValue("actual");
+        Settings settings = Settings.of(actual);
+
+        assertSame(actual.getValue(), settings.get(key));
+    }
+
+    /**
+     * {@code get} must fall back to the value of the lookup setting when the stored setting
+     * comes from a separately created key, even though both use the same identifier.
+     */
+    @Test
+    public void testGetAbsentSetting() {
+        Setting<String> key = Setting.newKey("foo", String.class).withValue("key");
+
+        Setting<String> actual = Setting.newKey("foo", String.class).withValue("actual");
+        Settings settings = Settings.of(actual);
+
+        assertSame(key.getValue(), settings.get(key));
+    }
+
+    /**
+     * A stored null value must be returned as null rather than replaced by the lookup setting's value.
+     */
+    @Test
+    public void testGetNullSetting() {
+        Setting.Key<String> baseKey = Setting.newKey("foo", String.class);
+
+        Settings settings = Settings.of(baseKey.withValue(null));
+        assertNull(settings.get(baseKey.withValue("not null")));
+    }
+
+    /**
+     * The value type of a setting must be resolved from the class passed to {@code newKey}
+     * or from the type argument of a {@link Setting.Key} subclass, including through
+     * several levels of generic subclasses with reordered type parameters.
+     */
+    @Test
+    public void testSettingType() {
+        assertEquals(CharSequence.class, Setting.newKey("foo", CharSequence.class).withValue("").getValueType());
+        assertEquals(CharSequence.class, new Setting.Key<CharSequence>("foo"){}.withValue("").getValueType());
+
+        Type mapType = new Setting.Key<Map<String, Integer>>(
+                "foo"){}.withValue(Collections.emptyMap()).getValueType();
+
+        assertTrue(mapType instanceof ParameterizedType);
+        assertEquals("java.util.Map<java.lang.String, java.lang.Integer>", mapType.getTypeName());
+
+        //the value type V travels through differently positioned type parameters at each level
+        class Key0<Bar, V> extends Setting.Key<V> {
+            Key0() {
+                super("foo");
+            }
+        }
+
+        class Key2<Baz, V, Bar> extends Key0<V, Bar> { }
+
+        class Key3<V> extends Key2<Boolean, Integer, List<Optional<String>>> { }
+
+        class Key4 extends Key3<Boolean> { }
+
+        Type complicatedType = new Key4().withValue(Collections.emptyList()).getValueType();
+
+        assertTrue(complicatedType instanceof ParameterizedType);
+        assertEquals("java.util.List<java.util.Optional<java.lang.String>>", complicatedType.getTypeName());
+
+        class Key3Simple<V> extends Key2<Boolean, Integer, String> { }
+
+        class Key4Simple extends Key3Simple<Boolean> { }
+
+        Type simpleType = new Key4Simple().withValue("").getValueType();
+
+        assertEquals(String.class, simpleType);
+    }
+
+
+
+    /**
+     * Key creation must fail with {@link IllegalArgumentException} for a raw key type, a null value class,
+     * a null or blank identifier, a primitive type, array types, and an unresolved type variable.
+     */
+    @Test
+    public void testBadSetting() {
+        //raw type: no value type can be determined
+        try {
+            new Setting.Key("foo") {};
+            fail();
+        } catch (IllegalArgumentException e) {
+            //test passes; ignore
+        }
+
+        //null value class
+        try {
+            Setting.newKey("foo", null);
+            fail();
+        } catch (IllegalArgumentException e) {
+            //test passes; ignore
+        }
+
+        //null identifier
+        try {
+            Setting.newKey(null, Integer.class);
+            fail();
+        } catch (IllegalArgumentException e) {
+            //test passes; ignore
+        }
+
+        //blank identifier
+        try {
+            Setting.newKey(" ", Integer.class);
+            fail();
+        } catch (IllegalArgumentException e) {
+            //test passes; ignore
+        }
+
+        //primitive value class
+        try {
+            Setting.newKey("foo", boolean.class);
+            fail();
+        } catch (IllegalArgumentException e) {
+            //test passes; ignore
+        }
+
+        //array value class
+        try {
+            Setting.newKey("foo", Integer[].class);
+            fail();
+        } catch (IllegalArgumentException e) {
+            //test passes; ignore
+        }
+
+        //array type argument
+        try {
+            new Setting.Key<Integer[]>("foo") {};
+            fail();
+        } catch (IllegalArgumentException e) {
+            //test passes; ignore
+        }
+
+        //generic array type argument
+        try {
+            new Setting.Key<List<Integer>[]>("foo") {};
+            fail();
+        } catch (IllegalArgumentException e) {
+            //test passes; ignore
+        }
+
+        //type argument that is an unresolved type variable of the enclosing method
+        class BadKeyCreator {
+            private <V> void badKey() {
+                new Setting.Key<V>("foo") {};
+            }
+        }
+
+        try {
+            new BadKeyCreator().badKey();
+            fail();
+        } catch (IllegalArgumentException e) {
+            //test passes; ignore
+        }
+    }
+
+
+}
diff --git a/api/src/test/java/org/apache/any23/writer/TripleFormatTest.java b/api/src/test/java/org/apache/any23/writer/TripleFormatTest.java
new file mode 100644 (file)
index 0000000..f91202c
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.writer;
+
+import org.eclipse.rdf4j.rio.RDFFormat;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertSame;
+
+public class TripleFormatTest {
+    /** Checks that rdf4j {@link RDFFormat} constants survive conversion to {@link TripleFormat} and back. */
+    @Test
+    public void testRdf4jRoundTripping() {
+        // rdf4j format constants exercised by the round trip
+        RDFFormat[] formats = {
+                RDFFormat.TRIX, RDFFormat.NQUADS, RDFFormat.RDFA, RDFFormat.TRIG,
+                RDFFormat.N3, RDFFormat.RDFXML, RDFFormat.TURTLE, RDFFormat.JSONLD,
+                RDFFormat.NTRIPLES, RDFFormat.BINARY, RDFFormat.RDFJSON
+        };
+
+        for (RDFFormat expected : formats) {
+            TripleFormat tf = TripleFormat.of(expected);
+            // converting straight back must hand out the very instance that was passed to of()
+            RDFFormat actual = tf.toRDFFormat();
+            assertSame(expected, actual);
+            // null out the rdfFormat field (presumably where of() kept the instance) so toRDFFormat() must build a new one
+            tf.rdfFormat = null;
+            actual = tf.toRDFFormat();
+            assertNotSame(expected, actual);
+            // the rebuilt format must agree with the original on every property compared below
+            assertEquals(expected.getName(), actual.getName());
+            assertEquals(expected.getStandardURI(), actual.getStandardURI());
+            assertEquals(expected.getCharset(), actual.getCharset());
+            assertEquals(expected.getFileExtensions(), actual.getFileExtensions());
+            assertEquals(expected.supportsContexts(), actual.supportsContexts());
+            assertEquals(expected.supportsNamespaces(), actual.supportsNamespaces());
+        }
+
+    }
+}
index 5b49b39..ef912f7 100644 (file)
@@ -25,13 +25,18 @@ import com.beust.jcommander.converters.FileConverter;
 import org.apache.any23.Any23;
 import org.apache.any23.configuration.Configuration;
 import org.apache.any23.configuration.DefaultConfiguration;
+import org.apache.any23.configuration.Setting;
+import org.apache.any23.configuration.Settings;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionParameters.ValidationMode;
 import org.apache.any23.filter.IgnoreAccidentalRDFa;
 import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
 import org.apache.any23.source.DocumentSource;
 import org.apache.any23.writer.BenchmarkTripleHandler;
+import org.apache.any23.writer.DecoratingWriterFactory;
+import org.apache.any23.writer.TripleWriterFactory;
 import org.apache.any23.writer.LoggingTripleHandler;
+import org.apache.any23.writer.NTriplesWriterFactory;
 import org.apache.any23.writer.ReportingTripleHandler;
 import org.apache.any23.writer.TripleHandler;
 import org.apache.any23.writer.TripleHandlerException;
@@ -41,12 +46,16 @@ import org.slf4j.LoggerFactory;
 
 import java.io.File;
 import java.io.FileNotFoundException;
+import java.io.OutputStream;
 import java.io.PrintStream;
 import java.io.PrintWriter;
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.util.Collections;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.ListIterator;
+import java.util.Objects;
 
 import static java.lang.String.format;
 
@@ -57,15 +66,42 @@ import static java.lang.String.format;
  * @author Michele Mostarda (mostarda@fbk.eu)
  * @author Richard Cyganiak (richard@cyganiak.de)
  * @author Gabriele Renzi
+ * @author Hans Brende (hansbrende@apache.org)
  */
 @Parameters(commandNames = { "rover" }, commandDescription = "Any23 Command Line Tool.")
 public class Rover extends BaseTool {
 
-    private static final List<String> FORMATS = WriterFactoryRegistry.getInstance().getIdentifiers();
+    private static final Logger logger = LoggerFactory.getLogger(Rover.class);
 
-    private static final int DEFAULT_FORMAT_INDEX = 0;
+    private static final WriterFactoryRegistry registry = WriterFactoryRegistry.getInstance();
+    private static final String DEFAULT_WRITER_IDENTIFIER = NTriplesWriterFactory.IDENTIFIER;
+
+    static {
+        final Setting<Boolean> ALWAYS_SUPPRESS_CSS_TRIPLES = Setting.newKey(
+                "alwayssuppresscsstriples", Boolean.class)
+                .withValue(Boolean.TRUE);
+        final Settings supportedSettings = Settings.of(ALWAYS_SUPPRESS_CSS_TRIPLES);
+        // Register a decorating factory under the id "notrivial" so it can be named in the --format list.
+        registry.register(new DecoratingWriterFactory() {
+            // Wraps the delegate in IgnoreTitlesOfEmptyDocuments, then IgnoreAccidentalRDFa (setting value = its 2nd ctor arg).
+            @Override
+            public TripleHandler getTripleWriter(TripleHandler delegate, Settings settings) {
+                boolean always = settings.get(ALWAYS_SUPPRESS_CSS_TRIPLES);
+                return new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments(delegate), always);
+            }
+            // Advertises the single "alwayssuppresscsstriples" setting built above.
+            @Override
+            public Settings getSupportedSettings() {
+                return supportedSettings;
+            }
+            // Identifier under which getWriter(...) looks this factory up in the registry.
+            @Override
+            public String getIdentifier() {
+                return "notrivial";
+            }
+        });
+    }
 
-    private static final Logger logger = LoggerFactory.getLogger(Rover.class);
 
     @Parameter(
        names = { "-o", "--output" },
@@ -80,8 +116,10 @@ public class Rover extends BaseTool {
     @Parameter(names = { "-e", "--extractors" }, description = "a comma-separated list of extractors, e.g. rdf-xml,rdf-turtle")
     private List<String> extractors = new LinkedList<>();
 
-    @Parameter(names = { "-f", "--format" }, description = "the output format")
-    private String format = FORMATS.get(DEFAULT_FORMAT_INDEX);
+    @Parameter(names = { "-f", "--format" }, description = "a comma-separated list of writer factories, e.g. notrivial,nquads")
+    // Defaults to the N-Triples writer id. A plain seeded list replaces double-brace initialization,
+    // whose anonymous LinkedList subclass would keep a hidden reference to this Rover instance.
+    private List<String> formats = new LinkedList<>(Collections.singletonList(DEFAULT_WRITER_IDENTIFIER));
 
     @Parameter(
        names = { "-l", "--log" },
@@ -93,7 +131,7 @@ public class Rover extends BaseTool {
     @Parameter(names = { "-s", "--stats" }, description = "Print out extraction statistics.")
     private boolean statistics;
 
-    @Parameter(names = { "-t", "--notrivial" }, description = "Filter trivial statements (e.g. CSS related ones).")
+    @Parameter(names = { "-t", "--notrivial" }, description = "Filter trivial statements (e.g. CSS related ones). [DEPRECATED: As of version 2.3, use --format instead.]")
     private boolean noTrivial;
 
     @Parameter(names = { "-p", "--pedantic" }, description = "Validate and fixes HTML content detecting commons issues.")
@@ -127,16 +165,28 @@ public class Rover extends BaseTool {
         outputStream = out;
     }
 
+    private static TripleHandler getWriter(String id, OutputStream os) {
+        TripleWriterFactory f = (TripleWriterFactory)registry.getWriterByIdentifier(id); // NOTE(review): an id registered for a non-TripleWriterFactory fails this cast with ClassCastException
+        Objects.requireNonNull(f, () -> "Invalid writer id '" + id + "'; admitted values: " + registry.getIdentifiers()); // a null lookup result becomes a NullPointerException naming the id and the registry's identifiers
+        return f.getTripleWriter(os, Settings.of()); //TODO parse TripleWriter settings from format list
+    }
+
+    private static TripleHandler getWriter(String id, TripleHandler delegate) {
+        DecoratingWriterFactory f = (DecoratingWriterFactory)registry.getWriterByIdentifier(id); // NOTE(review): an id registered for a non-DecoratingWriterFactory fails this cast with ClassCastException
+        Objects.requireNonNull(f, () -> "Invalid writer id '" + id + "'; admitted values: " + registry.getIdentifiers()); // a null lookup result becomes a NullPointerException naming the id and the registry's identifiers
+        return f.getTripleWriter(delegate, Settings.of()); //TODO parse delegate settings from format list
+    }
+
     protected void configure() {
-        try {
-            tripleHandler = WriterFactoryRegistry.getInstance().getWriterInstanceByIdentifier(format, outputStream);
-        } catch (Exception e) {
-            throw new NullPointerException(
-                    format("Invalid output format '%s', admitted values: %s",
-                        format,
-                        FORMATS
-                    )
-            );
+        List<String> formats = this.formats;
+        if (formats.isEmpty()) {
+            formats = Collections.singletonList(DEFAULT_WRITER_IDENTIFIER);
+        }
+        ListIterator<String> l = formats.listIterator(formats.size());
+        tripleHandler = getWriter(l.previous(), outputStream);
+
+        while (l.hasPrevious()) {
+            tripleHandler = getWriter(l.previous(), tripleHandler);
         }
 
         if (logFile != null) {
diff --git a/cli/src/test/java/org/apache/any23/cli/ExtractorsFlowTest.java b/cli/src/test/java/org/apache/any23/cli/ExtractorsFlowTest.java
new file mode 100644 (file)
index 0000000..0b75f57
--- /dev/null
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.cli;
+
+import org.apache.any23.cli.flows.PeopleExtractor;
+import org.apache.any23.rdf.RDFUtils;
+import org.apache.commons.io.FileUtils;
+import org.eclipse.rdf4j.model.Model;
+import org.eclipse.rdf4j.model.impl.TreeModel;
+import org.eclipse.rdf4j.rio.Rio;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.lang.invoke.MethodHandles;
+import java.util.Arrays;
+import java.util.stream.Stream;
+
+/**
+ * Example test for task ANY23-396: runs Rover with a custom decorating writer in the format chain.
+ *
+ * @author Jacek Grzebyta (jgrzebyta@apache.org)
+ * @author Hans Brende (hansbrende@apache.org)
+ */
+public class ExtractorsFlowTest extends ToolTestBase {
+
+    private static final String testingDatafile = "/org/apache/any23/extractor/csv/test-comma.csv";
+    private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    public ExtractorsFlowTest() {
+        super(Rover.class);
+    }
+
+    /**
+     * Emulates the action described in ANY23-396.
+     */
+    @Test
+    public void runTestFor396() throws Exception {
+        File outputFile = File.createTempFile("mockdata-", ".ttl", tempDirectory);
+        File logFile = File.createTempFile("log-exec-", ".txt", tempDirectory);
+        // "-f people,turtle": the "people" decorator sits in front of the turtle writer; "-e csv" selects the csv extractor
+        runTool(String.format("-l %s -o %s -f people,turtle -e csv -d %s %s",
+                logFile.getAbsolutePath(),
+                outputFile.getAbsolutePath(),
+                PeopleExtractor.RAW_NS,
+                copyResourceToTempFile(testingDatafile).getAbsolutePath()));
+
+        // populate expected model
+        Model expected = new TreeModel();
+        Stream.of("Davide Palmisano", "Michele Mostarda", "Giovanni Tummarello")
+                .map(PeopleExtractor::createPerson).forEach(expected::addAll);
+
+        if (log.isDebugEnabled()) {
+            log.debug("\n\nlog file content:\n{}", FileUtils.readFileToString(logFile, "utf-8"));
+            log.debug("\n\nData file: \n{}", FileUtils.readFileToString(outputFile, "utf-8"));
+        }
+
+        Assert.assertTrue(assertCompareModels(expected, outputFile));
+    }
+
+    /**
+     * Parses {@code received} as RDF (format chosen from its file name) and reports whether it contains
+     * every statement of {@code expected}. The input stream is closed even when parsing throws.
+     */
+    private boolean assertCompareModels(Model expected, File received) throws Exception {
+        Model receivedModel = new TreeModel();
+        try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(received))) {
+            receivedModel.addAll(Arrays.asList(RDFUtils.parseRDF(
+                    Rio.getParserFormatForFileName(received.getName()).orElseThrow(AssertionError::new),
+                    in, received.toURI().toString())));
+        }
+        return receivedModel.containsAll(expected);
+    }
+}
index 15054e4..c2b7a86 100644 (file)
@@ -90,6 +90,45 @@ public class RoverTest extends ToolTestBase {
         Assert.assertEquals(0, graphCounter);
     }
 
+    @Test
+    public void testDelegatingWriterFactory() throws Exception {
+        final File outFile = File.createTempFile("rover-test", "out", tempDirectory);
+        final String DEFAULT_GRAPH = "http://test/default/ns";
+        final String stylesheet = "http://www.w3.org/1999/xhtml/vocab#stylesheet";
+        // Baseline run: plain nquads output is expected to keep the stylesheet statement.
+        Assert.assertEquals("Unexpected exit code.", 0, runTool(
+                String.format(
+                        "-o %s -f nquads %s -d %s",
+                        outFile.getAbsolutePath(),
+                        copyResourceToTempFile("/cli/basic-with-stylesheet.html").getAbsolutePath(),
+                        DEFAULT_GRAPH
+                )
+        ));
+
+        String content = FileUtils.readFileContent(outFile);
+
+        Assert.assertTrue(content.contains(stylesheet));
+
+        final int lineCountWithStylesheet = content.split("\\n").length;
+        // Second run: the "notrivial" decorator in front of nquads must filter that statement out.
+        Assert.assertEquals("Unexpected exit code.", 0, runTool(
+                String.format(
+                        "-o %s -f notrivial,nquads %s -d %s",
+                        outFile.getAbsolutePath(),
+                        copyResourceToTempFile("/cli/basic-with-stylesheet.html").getAbsolutePath(),
+                        DEFAULT_GRAPH
+                )
+        ));
+
+        content = FileUtils.readFileContent(outFile);
+
+        Assert.assertFalse(content.contains(stylesheet));
+
+        final int lineCountWithoutStylesheet = content.split("\\n").length;
+        // Exactly one output line (the stylesheet statement) must have disappeared.
+        Assert.assertEquals(lineCountWithStylesheet - 1, lineCountWithoutStylesheet);
+    }
+
     /* BEGIN: online tests. */
 
     @Test
diff --git a/cli/src/test/java/org/apache/any23/cli/flows/PeopleExtractor.java b/cli/src/test/java/org/apache/any23/cli/flows/PeopleExtractor.java
new file mode 100644 (file)
index 0000000..d1f31c0
--- /dev/null
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.cli.flows;
+
+import org.apache.any23.extractor.ExtractionContext;
+import org.apache.any23.vocab.CSV;
+import org.apache.any23.writer.CompositeTripleHandler;
+import org.apache.any23.writer.TripleHandler;
+import org.apache.any23.writer.TripleHandlerException;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.Literal;
+import org.eclipse.rdf4j.model.Model;
+import org.eclipse.rdf4j.model.Resource;
+import org.eclipse.rdf4j.model.Statement;
+import org.eclipse.rdf4j.model.Value;
+import org.eclipse.rdf4j.model.ValueFactory;
+import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
+import org.eclipse.rdf4j.model.impl.TreeModel;
+import org.eclipse.rdf4j.model.util.Models;
+import org.eclipse.rdf4j.model.vocabulary.RDF;
+import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.invoke.MethodHandles;
+import java.util.Collections;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+/**
+ * Proof of concept for ANY23-396: replaces the raw "csv" extractor triples with one Person per CSV row.
+ */
+public class PeopleExtractor extends CompositeTripleHandler {
+
+    private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    private static final CSV csv = CSV.getInstance();
+    private static final ValueFactory vf = SimpleValueFactory.getInstance();
+    public static final String RAW_NS = "urn:dataser:raw/";
+    private static final IRI RAW_FIRST_NAME = vf.createIRI(RAW_NS, "FirstName");
+    private static final IRI RAW_LAST_NAME = vf.createIRI(RAW_NS, "LastName");
+
+    private static final String NAMESPACE = "http://supercustom.net/ontology/";
+    private static final IRI PERSON = vf.createIRI(NAMESPACE, "Person");
+    private static final IRI FULL_NAME = vf.createIRI(NAMESPACE, "fullName");
+    private static final IRI HASH = vf.createIRI(NAMESPACE, "hash");
+    /** Builds the three statements (type, fullName, hash) for one person; the subject IRI ends in the SHA-1 hex of the name. */
+    public static Model createPerson(String fullName) {
+        IRI s = vf.createIRI("http://rdf.supercustom.net/data/", DigestUtils.sha1Hex(fullName));
+        Model model = new TreeModel();
+        model.add(s, RDF.TYPE, PERSON);
+        model.add(s, FULL_NAME, vf.createLiteral(fullName));
+        model.add(s, HASH, vf.createLiteral(s.getLocalName(), XMLSchema.HEXBINARY));
+        return model;
+    }
+    /** Triples received from the "csv" extractor, held back until a context closes. */
+    private final Model csvModel = new TreeModel();
+    /** @param delegate handler that receives the generated person triples and every non-csv triple */
+    public PeopleExtractor(TripleHandler delegate) {
+        super(Collections.singletonList(delegate));
+    }
+    /** Buffers triples coming from the "csv" extractor; every other triple is passed on unchanged. */
+    @Override
+    public void receiveTriple(Resource s, IRI p, Value o, IRI g, ExtractionContext context) throws TripleHandlerException {
+        if ("csv".equals(context.getExtractorName())) {
+            csvModel.add(s, p, o, vf.createIRI(context.getUniqueID()));
+        } else {
+            super.receiveTriple(s, p, o, g, context);
+        }
+    }
+    /** Turns each buffered CSV row into person triples, forwards them, clears the buffer, then closes the context. */
+    @Override
+    public void closeContext(ExtractionContext context) throws TripleHandlerException {
+        Set<Resource> subjects = csvModel.filter(null, RDF.TYPE, csv.rowType)
+                .stream().map(Statement::getSubject).collect(Collectors.toSet());
+
+        log.debug("List of rows: {}", subjects);
+
+        for (Resource rowId : subjects) {
+            String firstName = Models.objectLiteral(csvModel.filter(rowId, RAW_FIRST_NAME, null))
+                    .map(Literal::getLabel).orElse("");
+
+            String lastName = Models.objectLiteral(csvModel.filter(rowId, RAW_LAST_NAME, null))
+                    .map(Literal::getLabel).orElse("");
+
+            String fullName = firstName + " " + lastName;
+
+            for (Statement s : createPerson(fullName)) {
+                super.receiveTriple(s.getSubject(), s.getPredicate(), s.getObject(), null, context);
+            }
+        }
+
+        csvModel.clear();
+
+        super.closeContext(context);
+    }
+
+}
\ No newline at end of file
diff --git a/cli/src/test/java/org/apache/any23/cli/flows/PeopleExtractorFactory.java b/cli/src/test/java/org/apache/any23/cli/flows/PeopleExtractorFactory.java
new file mode 100644 (file)
index 0000000..75d4c61
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.cli.flows;
+
+import org.apache.any23.configuration.Settings;
+import org.apache.any23.writer.DecoratingWriterFactory;
+import org.apache.any23.writer.TripleHandler;
+
+public class PeopleExtractorFactory implements DecoratingWriterFactory {
+    // Identifier used for this factory in format lists (e.g. "-f people,turtle" in ExtractorsFlowTest).
+    @Override
+    public String getIdentifier() {
+        return "people";
+    }
+    // Wraps the delegate in a PeopleExtractor; the settings argument is not used.
+    @Override
+    public TripleHandler getTripleWriter(TripleHandler delegate, Settings settings) {
+        return new PeopleExtractor(delegate);
+    }
+    // No settings are supported: returns an empty Settings.
+    @Override
+    public Settings getSupportedSettings() {
+        return Settings.of();
+    }
+}
\ No newline at end of file
diff --git a/cli/src/test/resources/META-INF/services/org.apache.any23.writer.WriterFactory b/cli/src/test/resources/META-INF/services/org.apache.any23.writer.WriterFactory
new file mode 100644 (file)
index 0000000..c595410
--- /dev/null
@@ -0,0 +1 @@
+org.apache.any23.cli.flows.PeopleExtractorFactory
index 50d3900..4edf9be 100644 (file)
  */
 package org.apache.any23.writer;
 
+import org.apache.any23.configuration.Settings;
+import org.eclipse.rdf4j.rio.WriterConfig;
+import org.eclipse.rdf4j.rio.helpers.BasicWriterSettings;
+
 import java.io.OutputStream;
-import org.eclipse.rdf4j.rio.RDFFormat;
-import org.eclipse.rdf4j.rio.Rio;
 
 /**
- * Implementation of <i>JSON-LD</i> format writer.
+ * Implementation of <i>JSON-LD</i> {@link TripleWriter}.
  *
  * @author Julio Caguano
+ * @author Hans Brende (hansbrende@apache.org)
  */
-public class JSONLDWriter extends RDFWriterTripleHandler implements FormatWriter {
+public class JSONLDWriter extends RDFWriterTripleHandler {
+
+    static class Internal {
+        private static final org.eclipse.rdf4j.rio.jsonld.JSONLDWriterFactory rdf4j
+                = new org.eclipse.rdf4j.rio.jsonld.JSONLDWriterFactory();
+        // TripleFormat obtained from the rdf4j factory via format(...) (helper defined outside this hunk).
+        static final TripleFormat FORMAT = format(rdf4j);
+        // Settings advertised for this writer: only PRETTY_PRINT.
+        static final Settings SUPPORTED_SETTINGS = Settings.of(
+                WriterSettings.PRETTY_PRINT
+        );
+    }
+
+    @Override
+    void configure(WriterConfig config, Settings settings) {
+        config.set(BasicWriterSettings.PRETTY_PRINT, settings.get(WriterSettings.PRETTY_PRINT)); // copy Any23's PRETTY_PRINT value into the rdf4j writer config
+    }
+
+
 
     public JSONLDWriter(OutputStream os) {
-        super(Rio.createWriter(RDFFormat.JSONLD, os));
+        this(os, Settings.of());
     }
+
+    public JSONLDWriter(OutputStream os, Settings settings) {
+        super(Internal.rdf4j, Internal.FORMAT, os, settings);
+    }
+
 }
index df20279..482b0a9 100644 (file)
 package org.apache.any23.writer;
 
 import java.io.OutputStream;
-import org.eclipse.rdf4j.rio.RDFFormat;
+
+import org.apache.any23.configuration.Settings;
 
 /**
  *
  * @author Julio Caguano.
+ * @author Hans Brende (hansbrende@apache.org)
  */
-public class JSONLDWriterFactory implements WriterFactory {
+public class JSONLDWriterFactory implements TripleWriterFactory {
 
-    public static final String MIME_TYPE = RDFFormat.JSONLD.getDefaultMIMEType();
+    public static final String MIME_TYPE = JSONLDWriter.Internal.FORMAT.getMimeType();
     public static final String IDENTIFIER = "jsonld";
 
     @Override
-    public RDFFormat getRdfFormat() {
-        return RDFFormat.JSONLD;
+    public TripleFormat getTripleFormat() {
+        return JSONLDWriter.Internal.FORMAT;
     }
 
     @Override
     public String getIdentifier() {
-        return JSONLDWriterFactory.IDENTIFIER;
+        return IDENTIFIER;
     }
 
     @Override
-    public String getMimeType() {
-        return JSONLDWriterFactory.MIME_TYPE;
+    public TripleHandler getTripleWriter(OutputStream out, Settings settings) {
+        return new JSONLDWriter(out, settings);
     }
 
     @Override
-    public FormatWriter getRdfWriter(OutputStream os) {
-        return new JSONLDWriter(os);
+    public Settings getSupportedSettings() {
+        return JSONLDWriter.Internal.SUPPORTED_SETTINGS;
     }
 
 }
index 70e2700..58d869a 100644 (file)
@@ -22,7 +22,8 @@ import com.fasterxml.jackson.core.util.DefaultPrettyPrinter;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.util.Optional;
-import org.apache.any23.extractor.ExtractionContext;
+
+import org.apache.any23.configuration.Settings;
 import org.eclipse.rdf4j.model.BNode;
 import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.Literal;
@@ -30,11 +31,11 @@ import org.eclipse.rdf4j.model.Resource;
 import org.eclipse.rdf4j.model.Value;
 
 /**
- * Implementation of <i>JSON</i> format writer.
+ * Implementation of <i>JSON</i> {@link TripleWriter}.
  *
  * @author Michele Mostarda (mostarda@fbk.eu)
  */
-public class JSONWriter implements FormatWriter {
+public class JSONWriter extends TripleWriterHandler implements FormatWriter {
 
     private JsonGenerator ps;
     private boolean documentStarted = false;
@@ -46,18 +47,21 @@ public class JSONWriter implements FormatWriter {
         JsonFactory factory = new JsonFactory();
         try {
             this.ps = factory.createGenerator(os)
+                    .disable(JsonGenerator.Feature.AUTO_CLOSE_TARGET)
+                    .enable(JsonGenerator.Feature.FLUSH_PASSED_TO_STREAM)
                     .setPrettyPrinter(new DefaultPrettyPrinter());
         } catch (IOException ex) {
         }
     }
 
-    @Override
-    public void startDocument(IRI documentIRI) throws TripleHandlerException {
+    private void start(boolean throwIfStarted) throws TripleHandlerException {
         if (documentStarted) {
-            throw new IllegalStateException("Document already started.");
+            if (throwIfStarted) {
+                throw new IllegalStateException("Document already started.");
+            }
+            return;
         }
         documentStarted = true;
-
         try {
             ps.writeStartObject();
             ps.writeFieldName("quads");
@@ -68,14 +72,14 @@ public class JSONWriter implements FormatWriter {
     }
 
     @Override
-    public void openContext(ExtractionContext context) throws TripleHandlerException {
-        // Empty.
+    public void startDocument(IRI documentIRI) throws TripleHandlerException {
+        start(true);
     }
 
     @Override
-    public void receiveTriple(Resource s, IRI p, Value o, IRI g, ExtractionContext context)
+    public void writeTriple(Resource s, IRI p, Value o, Resource g)
             throws TripleHandlerException {
-        validateDocumentStarted();
+        start(false);
         try {
             ps.writeStartArray();
 
@@ -104,43 +108,28 @@ public class JSONWriter implements FormatWriter {
     }
 
     @Override
-    public void receiveNamespace(String prefix, String uri, ExtractionContext context)
+    public void writeNamespace(String prefix, String uri)
             throws TripleHandlerException {
         // Empty.
     }
 
     @Override
-    public void closeContext(ExtractionContext context) throws TripleHandlerException {
-        // Empty.
-    }
-
-    @Override
     public void endDocument(IRI documentIRI) throws TripleHandlerException {
         validateDocumentStarted();
-
-        try {
-            ps.writeEndArray();
-            ps.writeEndObject();
-            documentStarted = false;
-        } catch (IOException ex) {
-            throw new TripleHandlerException("IO Error while closing document.", ex);
-        }
-    }
-
-    @Override
-    public void setContentLength(long contentLength) {
-        // Empty.
     }
 
     @Override
     public void close() throws TripleHandlerException {
-        if (documentStarted) {
-            endDocument(null);
-        }
+        start(false);
+
         try {
+            ps.writeEndArray();
+            ps.writeEndObject();
             ps.close();
         } catch (IOException ex) {
-            throw new TripleHandlerException("IO Error while closing stream.", ex);
+            throw new TripleHandlerException("IO Error while closing document.", ex);
+        } finally {
+            ps = null;
         }
     }
 
index eea4def..8877a25 100644 (file)
 
 package org.apache.any23.writer;
 
-import java.io.OutputStream;
+import org.apache.any23.configuration.Settings;
 
-import org.eclipse.rdf4j.rio.RDFFormat;
+import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
 
 /**
  * @author Peter Ansell p_ansell@yahoo.com
- * 
+ * @author Hans Brende (hansbrende@apache.org)
  */
-public class JSONWriterFactory implements WriterFactory {
+public class JSONWriterFactory implements TripleWriterFactory {
+
 
     public static final String MIME_TYPE = "text/json";
     public static final String IDENTIFIER = "json";
 
+    private static final TripleFormat FORMAT = TripleFormat.of("JSON", Collections.singleton(MIME_TYPE),
+            StandardCharsets.UTF_8, Collections.emptySet(), null, TripleFormat.QUADS);
     /**
      * 
      */
@@ -37,24 +42,23 @@ public class JSONWriterFactory implements WriterFactory {
     }
 
     @Override
-    public RDFFormat getRdfFormat() {
-        throw new RuntimeException(
-                "TODO: Implement an RDFFormat for this RDF JSON serialisation format");
+    public TripleFormat getTripleFormat() {
+        return FORMAT;
     }
 
     @Override
-    public String getIdentifier() {
-        return JSONWriterFactory.IDENTIFIER;
+    public Settings getSupportedSettings() {
+        return Settings.of();
     }
 
     @Override
-    public String getMimeType() {
-        return JSONWriterFactory.MIME_TYPE;
+    public String getIdentifier() {
+        return JSONWriterFactory.IDENTIFIER;
     }
 
     @Override
-    public FormatWriter getRdfWriter(OutputStream os) {
-        return new JSONWriter(os);
+    public TripleHandler getTripleWriter(OutputStream out, Settings settings) {
+        return new JSONWriter(out);
     }
 
 }
index 359f62c..ebbd9c2 100644 (file)
 
 package org.apache.any23.writer;
 
-import java.io.OutputStream;
+import org.apache.any23.configuration.Settings;
+import org.eclipse.rdf4j.rio.WriterConfig;
+import org.eclipse.rdf4j.rio.helpers.NTriplesWriterSettings;
 
-import org.eclipse.rdf4j.rio.RDFFormat;
-import org.eclipse.rdf4j.rio.Rio;
+import java.io.OutputStream;
 
 /**
- * Implementation of an <i>NQuads</i> writer.
+ * Implementation of an <i>N-Quads</i> {@link TripleWriter}.
  *
  * @author Michele Mostarda (mostarda@fbk.eu)
+ * @author Hans Brende (hansbrende@apache.org)
  */
-public class NQuadsWriter extends RDFWriterTripleHandler implements FormatWriter {
+public class NQuadsWriter extends RDFWriterTripleHandler {
+
+    static class Internal {
+        private static final org.eclipse.rdf4j.rio.nquads.NQuadsWriterFactory rdf4j
+                = new org.eclipse.rdf4j.rio.nquads.NQuadsWriterFactory();
+        // TripleFormat obtained from the rdf4j factory via format(...) (helper defined outside this hunk).
+        static final TripleFormat FORMAT = format(rdf4j);
+        // Settings advertised for this writer: only PRINT_ASCII.
+        static final Settings SUPPORTED_SETTINGS = Settings.of(
+                WriterSettings.PRINT_ASCII
+        );
+    }
+
+    @Override
+    void configure(WriterConfig config, Settings settings) {
+        config.set(NTriplesWriterSettings.ESCAPE_UNICODE, settings.get(WriterSettings.PRINT_ASCII)); // Any23's PRINT_ASCII value drives rdf4j's ESCAPE_UNICODE option
+    }
+
 
     public NQuadsWriter(OutputStream os) {
-        super( Rio.createWriter(RDFFormat.NQUADS, os) );
+        this(os, Settings.of());
+    }
+
+    public NQuadsWriter(OutputStream os, Settings settings) {
+        super(Internal.rdf4j, Internal.FORMAT, os, settings);
     }
 
 }
index 964d53c..767f2ae 100644 (file)
@@ -19,15 +19,15 @@ package org.apache.any23.writer;
 
 import java.io.OutputStream;
 
-import org.eclipse.rdf4j.rio.RDFFormat;
+import org.apache.any23.configuration.Settings;
 
 /**
- * @author Peter Ansell p_ansell@yahoo.com
- * 
+ * @author Peter Ansell (p_ansell@yahoo.com)
+ * @author Hans Brende (hansbrende@apache.org)
  */
-public class NQuadsWriterFactory implements WriterFactory {
+public class NQuadsWriterFactory implements TripleWriterFactory {
 
-    public static final String MIME_TYPE = RDFFormat.NQUADS.getDefaultMIMEType();
+    public static final String MIME_TYPE = NQuadsWriter.Internal.FORMAT.getMimeType();
     public static final String IDENTIFIER = "nquads";
 
     /**
@@ -37,23 +37,23 @@ public class NQuadsWriterFactory implements WriterFactory {
     }
 
     @Override
-    public RDFFormat getRdfFormat() {
-        return RDFFormat.NQUADS;
+    public TripleFormat getTripleFormat() {
+        return NQuadsWriter.Internal.FORMAT;
     }
 
     @Override
-    public String getIdentifier() {
-        return NQuadsWriterFactory.IDENTIFIER;
+    public Settings getSupportedSettings() {
+        return NQuadsWriter.Internal.SUPPORTED_SETTINGS;
     }
 
     @Override
-    public String getMimeType() {
-        return NQuadsWriterFactory.MIME_TYPE;
+    public String getIdentifier() {
+        return IDENTIFIER;
     }
 
     @Override
-    public FormatWriter getRdfWriter(OutputStream os) {
-        return new NQuadsWriter(os);
+    public TripleHandler getTripleWriter(OutputStream os, Settings settings) {
+        return new NQuadsWriter(os, settings);
     }
 
 }
index 0d862ae..933b185 100644 (file)
 
 package org.apache.any23.writer;
 
+import org.apache.any23.configuration.Settings;
+import org.eclipse.rdf4j.rio.WriterConfig;
+import org.eclipse.rdf4j.rio.helpers.NTriplesWriterSettings;
+
 import java.io.OutputStream;
 
 /**
- * <i>N3</i> triples writer.
+ * Implementation of an <i>N-Triples</i> {@link TripleWriter}.
+ * @author Hans Brende (hansbrende@apache.org)
  */
-public class NTriplesWriter extends RDFWriterTripleHandler implements FormatWriter {
+public class NTriplesWriter extends RDFWriterTripleHandler {
+
+    static class Internal {
+        private static final org.eclipse.rdf4j.rio.ntriples.NTriplesWriterFactory rdf4j
+                = new org.eclipse.rdf4j.rio.ntriples.NTriplesWriterFactory();
+
+        static final TripleFormat FORMAT = format(rdf4j);
+
+        static final Settings SUPPORTED_SETTINGS = Settings.of(
+                WriterSettings.PRINT_ASCII
+        );
+    }
+
+    @Override
+    void configure(WriterConfig config, Settings settings) {
+        config.set(NTriplesWriterSettings.ESCAPE_UNICODE, settings.get(WriterSettings.PRINT_ASCII));
+    }
 
     public NTriplesWriter(OutputStream out) {
-        super(new org.eclipse.rdf4j.rio.ntriples.NTriplesWriter(out));
+        this(out, Settings.of());
+    }
+
+    public NTriplesWriter(OutputStream os, Settings settings) {
+        super(Internal.rdf4j, Internal.FORMAT, os, settings);
     }
 
 }
index 91d5fed..a631347 100644 (file)
@@ -19,15 +19,15 @@ package org.apache.any23.writer;
 
 import java.io.OutputStream;
 
-import org.eclipse.rdf4j.rio.RDFFormat;
+import org.apache.any23.configuration.Settings;
 
 /**
- * @author Peter Ansell p_ansell@yahoo.com
- * 
+ * @author Peter Ansell (p_ansell@yahoo.com)
+ * @author Hans Brende (hansbrende@apache.org)
  */
-public class NTriplesWriterFactory implements WriterFactory {
+public class NTriplesWriterFactory implements TripleWriterFactory {
 
-    public static final String MIME_TYPE = RDFFormat.NTRIPLES.getDefaultMIMEType();
+    public static final String MIME_TYPE = NTriplesWriter.Internal.FORMAT.getMimeType();
     public static final String IDENTIFIER = "ntriples";
 
     /**
@@ -37,23 +37,23 @@ public class NTriplesWriterFactory implements WriterFactory {
     }
 
     @Override
-    public RDFFormat getRdfFormat() {
-        return RDFFormat.NTRIPLES;
+    public TripleFormat getTripleFormat() {
+        return NTriplesWriter.Internal.FORMAT;
     }
 
     @Override
-    public String getIdentifier() {
-        return NTriplesWriterFactory.IDENTIFIER;
+    public Settings getSupportedSettings() {
+        return NTriplesWriter.Internal.SUPPORTED_SETTINGS;
     }
 
     @Override
-    public String getMimeType() {
-        return NTriplesWriterFactory.MIME_TYPE;
+    public String getIdentifier() {
+        return NTriplesWriterFactory.IDENTIFIER;
     }
 
     @Override
-    public FormatWriter getRdfWriter(OutputStream os) {
-        return new NTriplesWriter(os);
+    public TripleHandler getTripleWriter(OutputStream os, Settings settings) {
+        return new NTriplesWriter(os, settings);
     }
 
 }
index aaf4105..c237ff5 100644 (file)
@@ -17,6 +17,7 @@
 
 package org.apache.any23.writer;
 
+import org.apache.any23.configuration.Settings;
 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.rdf.RDFUtils;
 import org.eclipse.rdf4j.model.Resource;
@@ -24,6 +25,16 @@ import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.Value;
 import org.eclipse.rdf4j.rio.RDFHandlerException;
 import org.eclipse.rdf4j.rio.RDFWriter;
+import org.eclipse.rdf4j.rio.RDFWriterFactory;
+import org.eclipse.rdf4j.rio.WriterConfig;
+
+import java.io.BufferedWriter;
+import java.io.Flushable;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.nio.charset.Charset;
+import java.util.Optional;
 
 /**
  * A {@link TripleHandler} that writes
@@ -32,25 +43,56 @@ import org.eclipse.rdf4j.rio.RDFWriter;
  *
  * @author Richard Cyganiak (richard@cyganiak.de)
  * @author Michele Mostarda (mostarda@fbk.eu)
+ * @author Hans Brende (hansbrende@apache.org)
  */
-public abstract class RDFWriterTripleHandler implements FormatWriter, TripleHandler {
-
-    protected final RDFWriter writer;
+public abstract class RDFWriterTripleHandler extends TripleWriterHandler implements FormatWriter {
 
-    private boolean closed = false;
+    private RDFWriter _writer;
+    private boolean writerStarted;
+    private final Flushable out;
+    private final TripleFormat format;
 
     /**
      * The annotation flag.
      */
     private boolean annotated = false;
 
-    protected RDFWriterTripleHandler(RDFWriter destination) {
-        writer = destination;
-        try {
-            writer.startRDF();
-        } catch (RDFHandlerException e) {
-            throw new RuntimeException(e);
+    static TripleFormat format(RDFWriterFactory rdf4j) {
+        return TripleFormat.of(rdf4j.getRDFFormat());
+    }
+
+    RDFWriterTripleHandler(RDFWriterFactory rdf4j, TripleFormat format, OutputStream out, Settings settings) {
+        this.format = format;
+        Optional<Charset> charset = format.getCharset();
+        RDFWriter w;
+        if (!charset.isPresent()) {
+            this.out = out;
+            w = _writer = rdf4j.getWriter(out);
+        } else {
+            //use buffered writer if format supports encoding
+            BufferedWriter buf = new BufferedWriter(new OutputStreamWriter(out, charset.get()));
+            this.out = buf;
+            w = _writer = rdf4j.getWriter(buf);
+        }
+        configure(w.getWriterConfig(), settings);
+    }
+
+    abstract void configure(WriterConfig config, Settings settings);
+
+    RDFWriter writer() throws TripleHandlerException {
+        RDFWriter w = _writer;
+        if (w == null) {
+            throw new TripleHandlerException("writer has been closed!");
+        }
+        if (!writerStarted) {
+            writerStarted = true;
+            try {
+                w.startRDF();
+            } catch (RDFHandlerException e) {
+                throw new TripleHandlerException("Error while starting document", e);
+            }
         }
+        return w;
     }
 
     /**
@@ -77,7 +119,7 @@ public abstract class RDFWriterTripleHandler implements FormatWriter, TripleHand
 
     @Override
     public void startDocument(IRI documentIRI) throws TripleHandlerException {
-        handleComment("OUTPUT FORMAT: " + writer.getRDFFormat());
+        handleComment("OUTPUT FORMAT: " + format);
     }
 
     @Override
@@ -86,25 +128,23 @@ public abstract class RDFWriterTripleHandler implements FormatWriter, TripleHand
     }
 
     @Override
-    public void receiveTriple(Resource s, IRI p, Value o, IRI g, ExtractionContext context)
+    public void writeTriple(Resource s, IRI p, Value o, Resource g)
     throws TripleHandlerException {
-        final IRI graph = g == null ? context.getDocumentIRI() : g;
         try {
-            writer.handleStatement(
-                    RDFUtils.quad(s, p, o, graph));
+            writer().handleStatement(RDFUtils.quad(s, p, o, g));
         } catch (RDFHandlerException ex) {
             throw new TripleHandlerException(
-                    String.format("Error while receiving triple: %s %s %s %s", s, p, o, graph),
+                    String.format("Error while receiving triple: %s %s %s %s", s, p, o, g),
                     ex
             );
         }
     }
 
     @Override
-    public void receiveNamespace(String prefix, String uri, ExtractionContext context)
+    public void writeNamespace(String prefix, String uri)
     throws TripleHandlerException {
         try {
-            writer.handleNamespace(prefix, uri);
+            writer().handleNamespace(prefix, uri);
         } catch (RDFHandlerException ex) {
             throw new TripleHandlerException(String.format("Error while receiving namespace: %s:%s", prefix, uri),
                     ex
@@ -119,32 +159,36 @@ public abstract class RDFWriterTripleHandler implements FormatWriter, TripleHand
 
     @Override
     public void close() throws TripleHandlerException {
-        if (closed) return;
-        closed = true;
+        RDFWriter writer = _writer;
+        if (writer == null) {
+            return;
+        }
+        _writer = null;
         try {
-            writer.endRDF();
+            if (!writerStarted) {
+                writer.startRDF();
+            }
+            writer.endRDF(); //calls flush()
         } catch (RDFHandlerException e) {
-            throw new TripleHandlerException("Error while closing the triple handler.", e);
+            throw new TripleHandlerException("Error closing writer", e);
         }
     }
 
     @Override
     public void endDocument(IRI documentIRI) throws TripleHandlerException {
-        // Empty.
-    }
-
-    @Override
-    public void setContentLength(long contentLength) {
-        // Empty.
+        try {
+            out.flush();
+        } catch (IOException e) {
+            throw new TripleHandlerException("Error ending document", e);
+        }
     }
 
     private void handleComment(String comment) throws TripleHandlerException {
         if( !annotated ) return;
         try {
-            writer.handleComment(comment);
+            writer().handleComment(comment);
         } catch (RDFHandlerException rdfhe) {
             throw new TripleHandlerException("Error while handing comment.", rdfhe);
         }
     }
-
 }
index ecbf6ed..1f8c127 100644 (file)
 
 package org.apache.any23.writer;
 
+import org.apache.any23.configuration.Settings;
+import org.eclipse.rdf4j.rio.WriterConfig;
+
 import java.io.OutputStream;
 
 /**
- * <i>RDF/XML</i> writer implementation.
+ * <i>RDF/XML</i> {@link TripleWriter} implementation.
+ * @author Hans Brende (hansbrende@apache.org)
  */
-public class RDFXMLWriter extends RDFWriterTripleHandler implements FormatWriter {
+public class RDFXMLWriter extends RDFWriterTripleHandler {
+
+    static class Internal {
+        private static final org.eclipse.rdf4j.rio.rdfxml.RDFXMLWriterFactory rdf4j
+                = new org.eclipse.rdf4j.rio.rdfxml.RDFXMLWriterFactory();
+
+        //TODO support pretty printing with RDFXMLPrettyWriterFactory
+
+        static final TripleFormat FORMAT = format(rdf4j);
+
+        static final Settings SUPPORTED_SETTINGS = Settings.of();
+    }
+
+    @Override
+    void configure(WriterConfig config, Settings settings) {
+    }
+
+    public RDFXMLWriter(OutputStream os) {
+        this(os, Settings.of());
+    }
 
-    public RDFXMLWriter(OutputStream out) {
-        super( new org.eclipse.rdf4j.rio.rdfxml.RDFXMLWriter(out) );
+    public RDFXMLWriter(OutputStream os, Settings settings) {
+        super(Internal.rdf4j, Internal.FORMAT, os, settings);
     }
 
 }
index c40bca3..a3dceb6 100644 (file)
@@ -19,15 +19,15 @@ package org.apache.any23.writer;
 
 import java.io.OutputStream;
 
-import org.eclipse.rdf4j.rio.RDFFormat;
+import org.apache.any23.configuration.Settings;
 
 /**
- * @author Peter Ansell p_ansell@yahoo.com
- * 
+ * @author Peter Ansell (p_ansell@yahoo.com)
+ * @author Hans Brende (hansbrende@apache.org)
  */
-public class RDFXMLWriterFactory implements WriterFactory {
+public class RDFXMLWriterFactory implements TripleWriterFactory {
 
-    public static final String MIME_TYPE = RDFFormat.RDFXML.getDefaultMIMEType();
+    public static final String MIME_TYPE = RDFXMLWriter.Internal.FORMAT.getMimeType();
     public static final String IDENTIFIER = "rdfxml";
 
     /**
@@ -37,23 +37,23 @@ public class RDFXMLWriterFactory implements WriterFactory {
     }
 
     @Override
-    public RDFFormat getRdfFormat() {
-        return RDFFormat.RDFXML;
+    public TripleFormat getTripleFormat() {
+        return RDFXMLWriter.Internal.FORMAT;
     }
 
     @Override
-    public String getIdentifier() {
-        return RDFXMLWriterFactory.IDENTIFIER;
+    public Settings getSupportedSettings() {
+        return RDFXMLWriter.Internal.SUPPORTED_SETTINGS;
     }
 
     @Override
-    public String getMimeType() {
-        return RDFXMLWriterFactory.MIME_TYPE;
+    public String getIdentifier() {
+        return IDENTIFIER;
     }
 
     @Override
-    public FormatWriter getRdfWriter(OutputStream os) {
-        return new RDFXMLWriter(os);
+    public TripleHandler getTripleWriter(OutputStream os, Settings settings) {
+        return new RDFXMLWriter(os, settings);
     }
 
 }
index 6ae871b..e048c17 100644 (file)
 
 package org.apache.any23.writer;
 
+import org.apache.any23.configuration.Settings;
+import org.eclipse.rdf4j.rio.WriterConfig;
+
 import java.io.OutputStream;
 
 /**
- * <a href="http://www.w3.org/2004/03/trix/">TriX</a> format writer implementation.
+ * <a href="http://www.w3.org/2004/03/trix/">TriX</a> {@link TripleWriter} implementation.
  *
  * @author Michele Mostarda (mostarda@fbk.eu)
+ * @author Hans Brende (hansbrende@apache.org)
  */
-public class TriXWriter extends RDFWriterTripleHandler implements FormatWriter {
+public class TriXWriter extends RDFWriterTripleHandler {
+
+    static class Internal {
+        private static final org.eclipse.rdf4j.rio.trix.TriXWriterFactory rdf4j
+                = new org.eclipse.rdf4j.rio.trix.TriXWriterFactory();
+
+        static final TripleFormat FORMAT = format(rdf4j);
+
+        static final Settings SUPPORTED_SETTINGS = Settings.of();
+    }
+
+    @Override
+    void configure(WriterConfig config, Settings settings) {
+    }
+
+    public TriXWriter(OutputStream os) {
+        this(os, Settings.of());
+    }
 
-    public TriXWriter(OutputStream out) {
-        super( new org.eclipse.rdf4j.rio.trix.TriXWriter(out) );
+    public TriXWriter(OutputStream os, Settings settings) {
+        super(Internal.rdf4j, Internal.FORMAT, os, settings);
     }
 
 }
index 0facc59..09fbfb8 100644 (file)
@@ -19,15 +19,15 @@ package org.apache.any23.writer;
 
 import java.io.OutputStream;
 
-import org.eclipse.rdf4j.rio.RDFFormat;
+import org.apache.any23.configuration.Settings;
 
 /**
  * @author Peter Ansell p_ansell@yahoo.com
- * 
+ * @author Hans Brende (hansbrende@apache.org)
  */
-public class TriXWriterFactory implements WriterFactory {
+public class TriXWriterFactory implements TripleWriterFactory {
 
-    public static final String MIME_TYPE = RDFFormat.TRIX.getDefaultMIMEType();
+    public static final String MIME_TYPE = TriXWriter.Internal.FORMAT.getMimeType();
     public static final String IDENTIFIER = "trix";
 
     /**
@@ -37,23 +37,23 @@ public class TriXWriterFactory implements WriterFactory {
     }
 
     @Override
-    public RDFFormat getRdfFormat() {
-        return RDFFormat.TRIX;
+    public TripleFormat getTripleFormat() {
+        return TriXWriter.Internal.FORMAT;
     }
 
     @Override
-    public String getIdentifier() {
-        return TriXWriterFactory.IDENTIFIER;
+    public Settings getSupportedSettings() {
+        return TriXWriter.Internal.SUPPORTED_SETTINGS;
     }
 
     @Override
-    public String getMimeType() {
-        return TriXWriterFactory.MIME_TYPE;
+    public String getIdentifier() {
+        return IDENTIFIER;
     }
 
     @Override
-    public FormatWriter getRdfWriter(OutputStream os) {
-        return new TriXWriter(os);
+    public TripleHandler getTripleWriter(OutputStream os, Settings settings) {
+        return new TriXWriter(os, settings);
     }
 
 }
diff --git a/core/src/main/java/org/apache/any23/writer/TripleWriterHandler.java b/core/src/main/java/org/apache/any23/writer/TripleWriterHandler.java
new file mode 100644 (file)
index 0000000..56fcdc3
--- /dev/null
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.writer;
+
+import org.apache.any23.extractor.ExtractionContext;
+import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.Resource;
+import org.eclipse.rdf4j.model.Value;
+
+/**
+ * This class connects a {@link TripleHandler} to a {@link TripleWriter} by writing received data.
+ *
+ * @author Hans Brende (hansbrende@apache.org)
+ */
+public abstract class TripleWriterHandler implements TripleHandler, TripleWriter {
+
+    /**
+     * Writers may override this method to handle a "receiveTriple" extraction event.
+     * The default implementation calls:
+     * <pre>
+     *     {@code this.writeTriple(s, p, o, context == null || g != null ? g : context.getDocumentIRI())}
+     * </pre>
+     * @param s the subject received
+     * @param p the predicate received
+     * @param o the object received
+     * @param g the graph name received, or null
+     * @param context the extraction context
+     * @throws TripleHandlerException if there was an error responding to a received triple
+     */
+    @Override
+    public void receiveTriple(Resource s, IRI p, Value o, IRI g, ExtractionContext context) throws TripleHandlerException {
+        writeTriple(s, p, o, context == null || g != null ? g : context.getDocumentIRI());
+    }
+
+    /**
+     * Writers may override this method to handle a "receiveNamespace" extraction event.
+     * The default implementation calls:
+     * <pre>
+     *     {@code this.writeNamespace(prefix, uri)}
+     * </pre>
+     * @param prefix namespace prefix.
+     * @param uri namespace <i>IRI</i>.
+     * @param context the extraction context
+     * @throws TripleHandlerException if there was an error responding to the received namespace.
+     */
+    @Override
+    public void receiveNamespace(String prefix, String uri, ExtractionContext context) throws TripleHandlerException {
+        writeNamespace(prefix, uri);
+    }
+
+    /**
+     * Writers may override this method to handle a "startDocument" extraction event.
+     * The default implementation does nothing.
+     * @param documentIRI the name of the document that was started
+     * @throws TripleHandlerException if an error occurred while responding to a "startDocument"
+     * extraction event.
+     */
+    @Override
+    public void startDocument(IRI documentIRI) throws TripleHandlerException { }
+
+    /**
+     * Writers may override this method to handle an "openContext" extraction event.
+     * The default implementation does nothing.
+     * @param context the context that was opened
+     * @throws TripleHandlerException if an error occurred while responding to an "openContext"
+     * extraction event.
+     */
+    @Override
+    public void openContext(ExtractionContext context) throws TripleHandlerException { }
+
+    /**
+     * Writers may override this method to handle a "closeContext" extraction event.
+     * The default implementation does nothing.
+     * @param context the context to be closed.
+     * @throws TripleHandlerException if an error occurred while responding to a "closeContext"
+     * extraction event.
+     */
+    @Override
+    public void closeContext(ExtractionContext context) throws TripleHandlerException { }
+
+    /**
+     * Writers may override this method to handle an "endDocument" extraction event.
+     * The default implementation does nothing.
+     * @param documentIRI the document IRI.
+     * @throws TripleHandlerException if an error occurred while responding to an "endDocument"
+     * extraction event.
+     */
+    @Override
+    public void endDocument(IRI documentIRI) throws TripleHandlerException { }
+
+    /**
+     * Writers may override this method to handle a "setContentLength" extraction event.
+     * The default implementation does nothing.
+     * @param contentLength length of the content being processed.
+     */
+    @Override
+    public void setContentLength(long contentLength) { }
+
+}
index 0771fb4..31559c2 100644 (file)
 
 package org.apache.any23.writer;
 
+import org.apache.any23.configuration.Settings;
+import org.eclipse.rdf4j.common.net.ParsedIRI;
+import org.eclipse.rdf4j.rio.RDFWriter;
+import org.eclipse.rdf4j.rio.WriterConfig;
+import org.eclipse.rdf4j.rio.helpers.BasicWriterSettings;
+
 import java.io.OutputStream;
+import java.io.Writer;
+import java.net.URISyntaxException;
 
 /**
- * <i>N3</i> notation writer.
+ * <i>N3</i> notation {@link TripleWriter} implementation.
+ * @author Hans Brende (hansbrende@apache.org)
  */
-public class TurtleWriter extends RDFWriterTripleHandler implements FormatWriter {
+public class TurtleWriter extends RDFWriterTripleHandler {
+
+    static class Internal {
+        // rdf4j-internal ArrangedWriter + -ea causes AssertionError
+        // when writing example output of html-mf-hlisting extractor!
+        // Override to return rdf4j TurtleWriter instances instead.
+        private static final org.eclipse.rdf4j.rio.turtle.TurtleWriterFactory rdf4j
+                = new org.eclipse.rdf4j.rio.turtle.TurtleWriterFactory() {
+            @Override
+            public RDFWriter getWriter(OutputStream out) {
+                return new org.eclipse.rdf4j.rio.turtle.TurtleWriter(out);
+            }
+            @Override
+            public RDFWriter getWriter(OutputStream out, String baseURI) throws URISyntaxException {
+                return new org.eclipse.rdf4j.rio.turtle.TurtleWriter(out, new ParsedIRI(baseURI));
+            }
+            @Override
+            public RDFWriter getWriter(Writer writer) {
+                return new org.eclipse.rdf4j.rio.turtle.TurtleWriter(writer);
+            }
+            @Override
+            public RDFWriter getWriter(Writer writer, String baseURI) throws URISyntaxException {
+                return new org.eclipse.rdf4j.rio.turtle.TurtleWriter(writer, new ParsedIRI(baseURI));
+            }
+        };
+
+        static final TripleFormat FORMAT = format(rdf4j);
+
+        static final Settings SUPPORTED_SETTINGS = Settings.of(
+                WriterSettings.PRETTY_PRINT
+        );
+    }
+
+    @Override
+    void configure(WriterConfig config, Settings settings) {
+        config.set(BasicWriterSettings.PRETTY_PRINT, settings.get(WriterSettings.PRETTY_PRINT));
+    }
 
     /**
      * Constructor.
@@ -30,7 +75,11 @@ public class TurtleWriter extends RDFWriterTripleHandler implements FormatWriter
      * @param out stream to write on.
      */
     public TurtleWriter(OutputStream out) {
-        super(new org.eclipse.rdf4j.rio.turtle.TurtleWriter(out));
+        this(out, Settings.of());
+    }
+
+    public TurtleWriter(OutputStream os, Settings settings) {
+        super(Internal.rdf4j, Internal.FORMAT, os, settings);
     }
 
 }
index 6a04e28..a0db985 100644 (file)
@@ -19,15 +19,15 @@ package org.apache.any23.writer;
 
 import java.io.OutputStream;
 
-import org.eclipse.rdf4j.rio.RDFFormat;
+import org.apache.any23.configuration.Settings;
 
 /**
  * @author Peter Ansell p_ansell@yahoo.com
- * 
+ * @author Hans Brende (hansbrende@apache.org)
  */
-public class TurtleWriterFactory implements WriterFactory {
+public class TurtleWriterFactory implements TripleWriterFactory {
 
-    public static final String MIME_TYPE = RDFFormat.TURTLE.getDefaultMIMEType();
+    public static final String MIME_TYPE = TurtleWriter.Internal.FORMAT.getMimeType();
     public static final String IDENTIFIER = "turtle";
 
     /**
@@ -37,23 +37,23 @@ public class TurtleWriterFactory implements WriterFactory {
     }
 
     @Override
-    public RDFFormat getRdfFormat() {
-        return RDFFormat.TURTLE;
+    public TripleFormat getTripleFormat() {
+        return TurtleWriter.Internal.FORMAT;
     }
 
     @Override
-    public String getIdentifier() {
-        return TurtleWriterFactory.IDENTIFIER;
+    public Settings getSupportedSettings() {
+        return TurtleWriter.Internal.SUPPORTED_SETTINGS;
     }
 
     @Override
-    public String getMimeType() {
-        return TurtleWriterFactory.MIME_TYPE;
+    public String getIdentifier() {
+        return IDENTIFIER;
     }
 
     @Override
-    public FormatWriter getRdfWriter(OutputStream os) {
-        return new TurtleWriter(os);
+    public TripleHandler getTripleWriter(OutputStream os, Settings settings) {
+        return new TurtleWriter(os, settings);
     }
 
 }
index f8faca5..ae3aecf 100644 (file)
 
 package org.apache.any23.writer;
 
-import org.apache.any23.extractor.ExtractionContext;
 import org.eclipse.rdf4j.model.Resource;
 import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.Value;
 
+import java.io.BufferedWriter;
 import java.io.OutputStream;
-import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.List;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.TreeSet;
 
 /**
  * This writer simply produces a list of unique <i>IRI</i> present in the
  * subject or in the object of every single extracted <i>RDF Statement</i>.
  * 
  * @author Davide Palmisano (palmisano@fbk.eu)
+ * @author Hans Brende (hansbrende@apache.org)
  */
-public class URIListWriter implements FormatWriter {
+public class URIListWriter extends TripleWriterHandler implements FormatWriter {
 
-    private List<Resource> resources;
+    private static final Charset charset = StandardCharsets.UTF_8;
 
-    private PrintStream printStream;
+    static final TripleFormat FORMAT = TripleFormat.of("URIList",
+            Collections.singleton(URIListWriterFactory.MIME_TYPE), charset, Collections.singleton("txt"), null,
+            TripleFormat.NONSTANDARD);
 
-    private ExtractionContext extractionContext;
+    private final TreeSet<String> resources = new TreeSet<>();
 
-    private long contentLength;
+    private PrintWriter writer;
 
     public URIListWriter(OutputStream outputStream) {
-        this.resources = new ArrayList<Resource>();
-        this.printStream = new PrintStream(outputStream);
+        writer = new PrintWriter(new BufferedWriter(
+                new OutputStreamWriter(outputStream, charset)));
     }
 
-    public void startDocument(IRI documentIRI) throws TripleHandlerException {}
-
-    public void openContext(ExtractionContext context) throws TripleHandlerException {
-        this.extractionContext = context;
-    }
-
-    public void receiveTriple(Resource s, IRI p, Value o, IRI g, ExtractionContext context)
+    @Override
+    public void writeTriple(Resource s, IRI p, Value o, Resource g)
             throws TripleHandlerException {
-        if(!this.resources.contains(s)) {
-            this.resources.add(s);
-            this.printStream.println(s.stringValue());
+        String string;
+        if (s instanceof IRI && resources.add(string = s.stringValue())) {
+            writer.println(string);
         }
-        if(o instanceof Resource && !this.resources.contains(o)) {
-            this.resources.add((Resource) o);
-            this.printStream.println(o.stringValue());
+        if (o instanceof IRI && resources.add(string = o.stringValue())) {
+            writer.println(string);
         }
     }
 
-    public void receiveNamespace(String prefix, String uri, ExtractionContext context)
+    @Override
+    public void writeNamespace(String prefix, String uri)
             throws TripleHandlerException {
     }
 
-    public void closeContext(ExtractionContext context) throws TripleHandlerException {
-    }
-
+    @Override
     public void endDocument(IRI documentIRI) throws TripleHandlerException {
+        writer.flush();
     }
 
-    public void setContentLength(long contentLength) {
-        this.contentLength = contentLength;
-    }
-
+    @Override
     public void close() throws TripleHandlerException {
-        this.printStream.close();
+        writer.flush();
+        writer = null;
+        resources.clear();
     }
 
     @Override
@@ -93,4 +92,5 @@ public class URIListWriter implements FormatWriter {
     public void setAnnotated(boolean f) {
         // Empty.
     }
+
 }
index 9defefc..84836aa 100644 (file)
@@ -19,17 +19,18 @@ package org.apache.any23.writer;
 
 import java.io.OutputStream;
 
-import org.eclipse.rdf4j.rio.RDFFormat;
+import org.apache.any23.configuration.Settings;
 
 /**
  * @author Peter Ansell p_ansell@yahoo.com
- * 
+ * @author Hans Brende (hansbrende@apache.org)
  */
-public class URIListWriterFactory implements WriterFactory {
+public class URIListWriterFactory implements TripleWriterFactory {
 
     public static final String MIME_TYPE = "text/plain";
     public static final String IDENTIFIER = "uri";
 
+
     /**
      * 
      */
@@ -37,22 +38,22 @@ public class URIListWriterFactory implements WriterFactory {
     }
 
     @Override
-    public RDFFormat getRdfFormat() {
-        throw new RuntimeException("This writer does not print RDF triples");
+    public TripleFormat getTripleFormat() {
+        return URIListWriter.FORMAT;
     }
 
     @Override
-    public String getIdentifier() {
-        return URIListWriterFactory.IDENTIFIER;
+    public Settings getSupportedSettings() {
+        return Settings.of();
     }
 
     @Override
-    public String getMimeType() {
-        return URIListWriterFactory.MIME_TYPE;
+    public String getIdentifier() {
+        return IDENTIFIER;
     }
 
     @Override
-    public FormatWriter getRdfWriter(OutputStream os) {
+    public TripleHandler getTripleWriter(OutputStream os, Settings settings) {
         return new URIListWriter(os);
     }
 
diff --git a/core/src/main/java/org/apache/any23/writer/WriterSettings.java b/core/src/main/java/org/apache/any23/writer/WriterSettings.java
new file mode 100644 (file)
index 0000000..40e3b26
--- /dev/null
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.writer;
+
+import org.apache.any23.configuration.Setting;
+
+
+/**
+ *
+ * This class encapsulates commonly supported settings for {@link TripleWriter} implementations.
+ *
+ * @author Hans Brende (hansbrende@apache.org)
+ */
+public class WriterSettings {
+    private WriterSettings() {
+        throw new AssertionError();
+    }
+
+    // Keep identifiers short & sweet for ease of user's CLI usage!
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // (Since each WriterFactory must maintain its own pool of "supported settings",
+    //  we don't need to worry about identifiers being globally unique.
+    //  A single identifier could theoretically map to different keys--and
+    //  therefore to different semantics--under different WriterFactory instances.
+    //  Note that it is the *memory-based identity of the key*, not the
+    //  key's textual identifier, that denotes the semantics for a given setting.
+    //  However, since each Settings object is guaranteed to contain only one setting
+    //  per identifier, we can be assured that identifiers will be unique on a
+    //  per-WriterFactory basis.)
+
+    /**
+     * Directive to writer that output should be printed in a way that maximizes human readability.
+     */
+    public static final Setting<Boolean> PRETTY_PRINT = Setting.newKey("pretty", Boolean.class)
+            .withValue(Boolean.TRUE);
+
+    /**
+     * Directive to writer that, at a minimum, all non-ASCII characters should be escaped.
+     */
+    public static final Setting<Boolean> PRINT_ASCII = Setting.newKey("ascii", Boolean.class)
+            .withValue(Boolean.FALSE);
+
+
+}
index b49fd88..c245efb 100644 (file)
@@ -17,6 +17,6 @@
 
 /**
  * This package collects a set of {@link org.apache.any23.writer.TripleHandler}
- * decorators and specific <i>RDF</i> format writers.
+ * decorators and specific <i>RDF</i> format {@link org.apache.any23.writer.TripleWriter} implementations.
  */
 package org.apache.any23.writer;
\ No newline at end of file
index 4c4fffd..0099d9f 100644 (file)
@@ -35,7 +35,7 @@ public class JSONWriterTest {
     @Test
     public void testJSONWriting() throws TripleHandlerException, IOException {
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        writeContent(new JSONWriter(baos));
+        writeContentComplicated(new JSONWriter(baos));
 
         final String expected 
             = "{\n"
@@ -64,12 +64,16 @@ public class JSONWriterTest {
             + "  }, null ] ]\n"
             + "}";
         Assert.assertEquals(expected, baos.toString());
+
+        baos.reset();
+        writeContentSimple(new JSONWriter(baos));
+        Assert.assertEquals(expected, baos.toString());
     }
 
     @Test
     public void testJSONLDWriting() throws TripleHandlerException {
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        writeContent(new JSONLDWriter(baos));
+        writeContentComplicated(new JSONLDWriter(baos));
         final String expected =
                 "[ {\n" +
                 "  \"@graph\" : [ {\n" +
@@ -99,9 +103,37 @@ public class JSONWriterTest {
                 "  \"@id\" : \"http://graph/2\"\n" +
                 "} ]";
         Assert.assertEquals(expected, baos.toString());
+
+        baos.reset();
+        writeContentSimple(new JSONLDWriter(baos));
+        Assert.assertEquals(expected, baos.toString());
+    }
+
+    private void writeContentSimple(TripleWriter writer) throws TripleHandlerException {
+        writer.writeTriple(SimpleValueFactory.getInstance().createBNode("bn1"),
+                SimpleValueFactory.getInstance().createIRI("http://pred/1"),
+                SimpleValueFactory.getInstance().createIRI("http://value/1"),
+                SimpleValueFactory.getInstance().createIRI("http://graph/1"));
+
+        writer.writeTriple(SimpleValueFactory.getInstance().createIRI("http://sub/2"),
+                SimpleValueFactory.getInstance().createIRI("http://pred/2"),
+                SimpleValueFactory.getInstance().createLiteral("language literal", "en"),
+                SimpleValueFactory.getInstance().createIRI("http://graph/2"));
+
+        writer.writeTriple(
+                SimpleValueFactory.getInstance().createIRI("http://sub/3"),
+                SimpleValueFactory.getInstance().createIRI("http://pred/3"),
+                SimpleValueFactory.getInstance().createLiteral("123",
+                        SimpleValueFactory.getInstance().createIRI("http://datatype")),
+                writer instanceof JSONLDWriter ? SimpleValueFactory.getInstance().createIRI("http://any23.org/tmp/") : null);
+
+        writer.close();
+
     }
 
-    private void writeContent(FormatWriter writer) throws TripleHandlerException {
+    private void writeContentComplicated(TripleHandler writer) throws TripleHandlerException {
+        //creating a fake document uri in order to write triples is terrible.
+        //see improved solution in "writeContentSimple"!
         final IRI documentIRI = SimpleValueFactory.getInstance().createIRI("http://fake/uri");
         writer.startDocument(documentIRI);
         writer.receiveTriple(
@@ -127,6 +159,8 @@ public class JSONWriterTest {
                     null
             );
         } else if (writer instanceof JSONLDWriter) {
+            //creating a fake extraction context in order to write triples is terrible.
+            //see improved solution in "writeContentSimple"!
             ExtractionContext extractionContext = new ExtractionContext("rdf-nq", SimpleValueFactory.getInstance().createIRI("http://any23.org/tmp/"));
             writer.receiveTriple(
                     SimpleValueFactory.getInstance().createIRI("http://sub/3"),
index ec0ccf0..fc0f09a 100644 (file)
@@ -22,11 +22,13 @@ import java.util.Collection;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
+
+import org.apache.any23.configuration.Settings;
 import org.junit.Assert;
 import org.junit.Test;
 
 /**
- * Test case for {@link WriterRegistry}.
+ * Test case for {@link WriterFactoryRegistry}.
  *
  * @author Michele Mostarda (mostarda@fbk.eu)
  */
@@ -71,8 +73,16 @@ public class WriterRegistryTest {
     public void testGetWriterInstanceByIdentifier() {
         final List<String> ids = target.getIdentifiers();
         final ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        for(String id : ids) {
-            Assert.assertNotNull( target.getWriterInstanceByIdentifier(id, baos) );
+        final CompositeTripleHandler delegate = new CompositeTripleHandler();
+        for (String id : ids) {
+            WriterFactory f = target.getWriterByIdentifier(id);
+            if (f instanceof TripleWriterFactory) {
+                Assert.assertNotNull(((TripleWriterFactory) f).getTripleWriter(baos, Settings.of()));
+            } else if (f instanceof DecoratingWriterFactory) {
+                Assert.assertNotNull(((DecoratingWriterFactory) f).getTripleWriter(delegate, Settings.of()));
+            } else {
+                Assert.fail(id + " is not a valid writer factory");
+            }
         }
     }
 
index 024bf70..9640b17 100644 (file)
@@ -25,10 +25,12 @@ import java.security.cert.CertificateException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
+import java.util.stream.Collectors;
 import javax.servlet.ServletOutputStream;
 import javax.servlet.http.HttpServletResponse;
 import org.apache.any23.Any23;
 import org.apache.any23.ExtractionReport;
+import org.apache.any23.configuration.Settings;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.Extractor;
@@ -41,6 +43,7 @@ import org.apache.any23.validator.XMLValidationReportSerializer;
 import org.apache.any23.writer.CompositeTripleHandler;
 import org.apache.any23.writer.CountingTripleHandler;
 import org.apache.any23.writer.FormatWriter;
+import org.apache.any23.writer.TripleWriterFactory;
 import org.apache.any23.writer.ReportingTripleHandler;
 import org.apache.any23.writer.TripleHandler;
 import org.apache.any23.writer.TripleHandlerException;
@@ -315,19 +318,24 @@ class WebResponder {
 
     private boolean initRdfWriter(String format, boolean report, boolean annotate) throws IOException {
         final WriterFactory factory = getFormatWriter(format);
-        if (factory == null) {
+        if (!(factory instanceof TripleWriterFactory)) {
             sendError(
                     400,
-                    "Invalid format '" + format + "', try one of: [rdfxml, turtle, ntriples, nquads, trix, json]",
+                    "Invalid format '" + format + "', try one of: "
+                            + writerRegistry.getWriters().stream()
+                            .filter(f -> f instanceof TripleWriterFactory)
+                            .map(WriterFactory::getIdentifier).collect(Collectors.toList()),
                     null,
                     null,
                     report
             );
             return false;
         }
-        FormatWriter fw = factory.getRdfWriter(byteOutStream);
-        fw.setAnnotated(annotate);
-        outputMediaType = factory.getMimeType();
+        TripleHandler fw = ((TripleWriterFactory) factory).getTripleWriter(byteOutStream, Settings.of());
+        if (fw instanceof FormatWriter) {
+            ((FormatWriter)fw).setAnnotated(annotate);
+        }
+        outputMediaType = ((TripleWriterFactory) factory).getTripleFormat().getMimeType();
         List<TripleHandler> tripleHandlers = new ArrayList<>();
         tripleHandlers.add(new IgnoreAccidentalRDFa(fw));
         tripleHandlers.add(new CountingTripleHandler());
diff --git a/test-resources/src/test/resources/cli/basic-with-stylesheet.html b/test-resources/src/test/resources/cli/basic-with-stylesheet.html
new file mode 100644 (file)
index 0000000..6348000
--- /dev/null
@@ -0,0 +1,29 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<html xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
+<head>
+    <link rel="stylesheet" href="https://www.data.gov/app/plugins/simple-tooltips/zebra_tooltips.css?ver=4.9.1">
+</head>
+<body>
+<div xmlns:dc="http://purl.org/dc/terms/" xmlns:fake="http://fake.org/">
+    <h2 property="dc:title">The trouble with Bob</h2>
+    <h3 property="dc:creator">Alice</h3>
+    <h3 property="fake:prop">Mary</h3>
+    ...
+</div>
+</body>
+</html>
\ No newline at end of file