ANY23-341 Remove dependency on defunct commons-httpclient
author Hans <firedrake93@gmail.com>
Tue, 3 Apr 2018 18:33:20 +0000 (13:33 -0500)
committer Hans <firedrake93@gmail.com>
Wed, 4 Apr 2018 21:37:16 +0000 (16:37 -0500)
cli/pom.xml
core/pom.xml
core/src/main/java/org/apache/any23/extractor/html/HTMLDocument.java
core/src/main/java/org/apache/any23/http/DefaultHTTPClient.java
core/src/main/java/org/apache/any23/http/HTTPClient.java
core/src/main/java/org/apache/any23/source/HTTPDocumentSource.java
core/src/main/java/org/apache/any23/util/LogUtils.java
pom.xml
service/src/main/java/org/apache/any23/servlet/Servlet.java

index 07b7e6b..321b150 100644 (file)
       <artifactId>commons-lang</artifactId>
     </dependency>
     <dependency>
-      <groupId>commons-httpclient</groupId>
-      <artifactId>commons-httpclient</artifactId>
-    </dependency>
-    <dependency>
       <groupId>commons-codec</groupId>
       <artifactId>commons-codec</artifactId>
     </dependency>
index 6fd2550..58a37ee 100644 (file)
       <artifactId>commons-lang</artifactId>
     </dependency>
     <dependency>
-      <groupId>commons-httpclient</groupId>
-      <artifactId>commons-httpclient</artifactId>
-    </dependency>
-    <dependency>
       <groupId>commons-codec</groupId>
       <artifactId>commons-codec</artifactId>
     </dependency>
index bb958c7..188e0f1 100644 (file)
@@ -24,6 +24,7 @@ import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
@@ -375,15 +376,32 @@ public class HTMLDocument {
 
     private java.net.URI getBaseIRI() throws ExtractionException {
         if (baseIRI == null) {
+            // document.getBaseURI() returns null for document URIs with
+            // special characters, e.g., http://semanticweb.org/wiki/Knud_Möller
+            // It also does *not* take html "base" elements into account.
+            // (But it does take into account urls specified by the attribute "xml:base".)
+
+            // So, for now, let's use getDocumentURI() instead.
+            // TODO: Make this approach better.
+
+            Document doc = document instanceof Document ? (Document)document : document.getOwnerDocument();
+
+            if (doc == null) {
+                throw new ExtractionException("Node " + document.getNodeName() + " was not associated with a document.");
+            }
+
+            String uri = doc.getDocumentURI();
+
+            if (uri == null) {
+                throw new ExtractionException("document URI is null, this should not happen");
+            }
+
             try {
-                if (document.getBaseURI() == null) {
-                    log.warn("document.getBaseURI() is null, this should not happen");
-                }
-                baseIRI = new java.net.URI(RDFUtils.fixAbsoluteIRI(document.getBaseURI()));
+                baseIRI = new java.net.URI(RDFUtils.fixAbsoluteIRI(uri));
             } catch (IllegalArgumentException ex) {
-                throw new ExtractionException("Error in base IRI: " + document.getBaseURI(), ex);
+                throw new ExtractionException("Error in base IRI: " + uri, ex);
             } catch (URISyntaxException ex) {
-                throw new ExtractionException("Error in base IRI: " + document.getBaseURI(), ex);
+                throw new ExtractionException("Error in base IRI: " + uri, ex);
             }
         }
         return baseIRI;
index d520441..2615585 100644 (file)
 
 package org.apache.any23.http;
 
-import org.apache.commons.httpclient.*;
-import org.apache.commons.httpclient.methods.GetMethod;
-import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
+import org.apache.commons.io.IOUtils;
+import org.apache.http.Header;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.protocol.HttpClientContext;
+import org.apache.http.config.SocketConfig;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
+import org.apache.http.message.BasicHeader;
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.URI;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.regex.Pattern;
 
 /**
  * Opens an {@link InputStream} on an HTTP IRI. Is configured
@@ -37,9 +45,7 @@ import java.util.regex.Pattern;
  */
 public class DefaultHTTPClient implements HTTPClient {
 
-    private static final Pattern ESCAPED_PATTERN = Pattern.compile("%[0-9a-f]{2}",Pattern.CASE_INSENSITIVE);
-
-    private final MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
+    private final PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager();
 
     private HTTPClientConfiguration configuration;
 
@@ -51,9 +57,6 @@ public class DefaultHTTPClient implements HTTPClient {
 
     private String contentType = null;
 
-    public static final boolean isUrlEncoded(String url) {
-        return ESCAPED_PATTERN.matcher(url).find();
-    }
 
     /**
      * Creates a {@link DefaultHTTPClient} instance already initialized
@@ -82,35 +85,31 @@ public class DefaultHTTPClient implements HTTPClient {
      * located at the URI.
      */
     public InputStream openInputStream(String uri) throws IOException {
-        GetMethod method = null;
+        HttpGet method = null;
         try {
             ensureClientInitialized();
-            String uriStr;
-            try {
-                URI uriObj = new URI(uri, isUrlEncoded(uri));
-                // [scheme:][//authority][path][?query][#fragment]
-                uriStr = uriObj.toString();
-            } catch (URIException e) {
-                throw new IllegalArgumentException("Invalid IRI string.", e);
-            }
-            method = new GetMethod(uriStr);
-            method.setFollowRedirects(true);
-            client.executeMethod(method);
-            _contentLength = method.getResponseContentLength();
-            final Header contentTypeHeader = method.getResponseHeader("Content-Type");
+            HttpClientContext context = HttpClientContext.create();
+            method = new HttpGet(uri);
+            HttpResponse response = client.execute(method, context);
+            List<URI> locations = context.getRedirectLocations();
+
+            URI actualURI = locations == null || locations.isEmpty() ? method.getURI() : locations.get(locations.size() - 1);
+            actualDocumentIRI = actualURI.toString();
+
+            final Header contentTypeHeader = response.getFirstHeader("Content-Type");
             contentType = contentTypeHeader == null ? null : contentTypeHeader.getValue();
-            if (method.getStatusCode() != 200) {
+            if (response.getStatusLine().getStatusCode() != 200) {
                 throw new IOException(
-                        "Failed to fetch " + uri + ": " + method.getStatusCode() + " " + method.getStatusText()
+                        "Failed to fetch " + uri + ": " + response.getStatusLine().getStatusCode() + " " + response.getStatusLine().getReasonPhrase()
                 );
             }
-            actualDocumentIRI = method.getURI().toString();
-            byte[] response = method.getResponseBody();
 
-            return new ByteArrayInputStream(response);
+            byte[] bytes = IOUtils.toByteArray(response.getEntity().getContent());
+            _contentLength = bytes.length;
+            return new ByteArrayInputStream(bytes);
         } finally {
             if (method != null) {
-                method.releaseConnection();
+                method.reset();
             }
         }
     }
@@ -143,25 +142,38 @@ public class DefaultHTTPClient implements HTTPClient {
     }
 
     private void ensureClientInitialized() {
-        if(configuration == null) throw new IllegalStateException("client must be initialized first.");
-        if (client != null) return;
-        client = new HttpClient(manager);
-        HttpConnectionManager connectionManager = client.getHttpConnectionManager();
-        HttpConnectionManagerParams params = connectionManager.getParams();
-        params.setConnectionTimeout(configuration.getDefaultTimeout());
-        params.setSoTimeout(configuration.getDefaultTimeout());
-        params.setMaxTotalConnections(configuration.getMaxConnections());
-
-        HostConfiguration hostConf = client.getHostConfiguration();
-        List<Header> headers = new ArrayList<Header>();
-        headers.add(new Header("User-Agent", configuration.getUserAgent()));
+        if (configuration == null)
+            throw new IllegalStateException("client must be initialized first.");
+        if (client != null)
+            return;
+
+        RequestConfig requestConfig = RequestConfig.custom()
+                .setConnectTimeout(getConnectionTimeout())
+                .setSocketTimeout(getSoTimeout())
+                .setRedirectsEnabled(true)
+                .build();
+
+        SocketConfig socketConfig = SocketConfig.custom()
+                .setSoTimeout(getSoTimeout())
+                .build();
+
+        List<Header> headers = new ArrayList<>();
+        headers.add(new BasicHeader("User-Agent", configuration.getUserAgent()));
         if (configuration.getAcceptHeader() != null) {
-            headers.add(new Header("Accept", configuration.getAcceptHeader()));
+            headers.add(new BasicHeader("Accept", configuration.getAcceptHeader()));
         }
-        headers.add(new Header("Accept-Language", "en-us,en-gb,en,*;q=0.3")); //TODO: this must become parametric.
-        headers.add(new Header("Accept-Charset", "utf-8,iso-8859-1;q=0.7,*;q=0.5"));
-        // headers.add(new Header("Accept-Encoding", "x-gzip, gzip"));
-        hostConf.getParams().setParameter("http.default-headers", headers);
+        headers.add(new BasicHeader("Accept-Language", "en-us,en-gb,en,*;q=0.3")); //TODO: this must become parametric.
+        // headers.add(new BasicHeader("Accept-Encoding", "x-gzip, gzip"));
+        headers.add(new BasicHeader("Accept-Charset", "utf-8,iso-8859-1;q=0.7,*;q=0.5"));
+
+
+        client = HttpClients.custom()
+                .setConnectionManager(manager)
+                .setDefaultRequestConfig(requestConfig)
+                .setDefaultSocketConfig(socketConfig)
+                .setMaxConnTotal(configuration.getMaxConnections())
+                .setDefaultHeaders(headers)
+                .build();
     }
 
 }
\ No newline at end of file
index 0bc4dbc..3f08975 100644 (file)
@@ -33,7 +33,7 @@ public interface HTTPClient {
      *
      * @param configuration configuration for the HTTP Client.
      */
-    public abstract void init(HTTPClientConfiguration configuration);
+    void init(HTTPClientConfiguration configuration);
 
     /**
      * Opens the input stream for the given target IRI.
@@ -42,7 +42,7 @@ public interface HTTPClient {
      * @return input stream to access IRI content.
      * @throws IOException if any error occurs while reading the IRI content.
      */
-    public abstract InputStream openInputStream(String uri) throws IOException;
+    InputStream openInputStream(String uri) throws IOException;
 
     /**
     * Release all static resources held by the instance. Call this
@@ -50,7 +50,7 @@ public interface HTTPClient {
      * application, like for example when shutting down a servlet
      * context.
      */
-    public abstract void close();
+    void close();
 
     /**
      * The value of the Content-Type header reported by the server.
@@ -58,12 +58,12 @@ public interface HTTPClient {
      *
      * @return the content type as string.
      */
-    public abstract String getContentType();
+    String getContentType();
 
     /**
      * @return content length in bytes.
      */
-    public abstract long getContentLength();
+    long getContentLength();
 
     /**
      * Returns the actual IRI from which the document was fetched.
@@ -73,6 +73,6 @@ public interface HTTPClient {
      *
      * @return actual document IRI.
      */
-    public abstract String getActualDocumentIRI();
+    String getActualDocumentIRI();
     
 }
\ No newline at end of file
index fef124d..e9cebee 100644 (file)
 
 package org.apache.any23.source;
 
-import org.apache.any23.http.DefaultHTTPClient;
 import org.apache.any23.http.HTTPClient;
-import org.apache.commons.httpclient.URI;
-import org.apache.commons.httpclient.URIException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.URI;
 import java.net.URISyntaxException;
 
 /**
@@ -50,13 +48,12 @@ public class HTTPDocumentSource implements DocumentSource {
 
     private String normalize(String uri) throws URISyntaxException {
         try {
-            URI normalized = new URI(uri, DefaultHTTPClient.isUrlEncoded(uri));
-            normalized.normalize();
+            URI normalized = new URI(uri).normalize();
             return normalized.toString();
-        } catch (URIException e) {
+        } catch (URISyntaxException e) {
             LOG.warn("Invalid uri: {}", uri);
             LOG.error("Can not convert URL", e);
-            throw new URISyntaxException(uri, e.getMessage());
+            throw e;
         }
     }
 
index ef43c20..30b24ca 100644 (file)
@@ -27,8 +27,6 @@ public class LogUtils {
 
     public static void setDefaultLogging() {
         Logger.getLogger("").setLevel(Level.WARNING);
-        // Suppress silly cookie warnings.
-        Logger.getLogger("org.apache.commons.httpclient").setLevel(Level.SEVERE);
         Logger.getLogger("").getHandlers()[0].setLevel(Level.ALL);
     }
 
diff --git a/pom.xml b/pom.xml
index 4a62dd1..f0f809d 100644 (file)
--- a/pom.xml
+++ b/pom.xml
         <version>2.6</version>
       </dependency>
       <dependency>
-        <groupId>commons-httpclient</groupId>
-        <artifactId>commons-httpclient</artifactId>
-        <version>3.1</version>
-      </dependency>
-      <dependency>
         <groupId>org.apache.httpcomponents</groupId>
         <artifactId>httpclient</artifactId>
         <version>${httpclient.version}</version>
index 154f41d..ad7c1ed 100644 (file)
@@ -29,7 +29,6 @@ import org.apache.any23.source.ByteArrayDocumentSource;
 import org.apache.any23.source.DocumentSource;
 import org.apache.any23.source.HTTPDocumentSource;
 import org.apache.any23.source.StringDocumentSource;
-import org.apache.commons.httpclient.URI;
 import org.eclipse.rdf4j.rio.RDFFormat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -41,6 +40,7 @@ import javax.servlet.http.HttpServletResponse;
 
 import java.io.File;
 import java.io.IOException;
+import java.net.URI;
 import java.net.URISyntaxException;
 import java.util.regex.Pattern;
 
@@ -286,7 +286,7 @@ public class Servlet extends HttpServlet {
 
     private boolean isValidIRI(String s) {
         try {
-            URI uri = new URI(s, false);
+            URI uri = new URI(s);
             if (!"http".equals(uri.getScheme()) && !"https".equals(uri.getScheme())) {
                 return false;
             }