9cee7a464ed4997d4e6e75bd708a7b96d50fb6b3
[any23.git] / core / src / main / java / org / apache / any23 / extractor / SingleDocumentExtraction.java
1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.any23.extractor;
19
import org.apache.any23.configuration.Configuration;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.encoding.EncodingDetector;
import org.apache.any23.encoding.TikaEncodingDetector;
import org.apache.any23.extractor.Extractor.BlindExtractor;
import org.apache.any23.extractor.Extractor.ContentExtractor;
import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
import org.apache.any23.extractor.html.DocumentReport;
import org.apache.any23.extractor.html.HTMLDocument;
import org.apache.any23.extractor.html.MicroformatExtractor;
import org.apache.any23.extractor.html.TagSoupParser;
import org.apache.any23.mime.MIMEType;
import org.apache.any23.mime.MIMETypeDetector;
import org.apache.any23.rdf.Any23ValueFactoryWrapper;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.LocalCopyFactory;
import org.apache.any23.source.MemCopyFactory;
import org.apache.any23.validator.EmptyValidationReport;
import org.apache.any23.validator.ValidatorException;
import org.apache.any23.vocab.SINDICE;
import org.apache.any23.writer.CompositeTripleHandler;
import org.apache.any23.writer.CountingTripleHandler;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.UUID;

import static org.apache.any23.extractor.TagSoupExtractionResult.PropertyPath;
import static org.apache.any23.extractor.TagSoupExtractionResult.ResourceRoot;
68
69 /**
70 * This class acts as facade where all the extractors were called on a single document.
71 */
72 public class SingleDocumentExtraction {
73
74 private static final SINDICE vSINDICE = SINDICE.getInstance();
75
76 private static final Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class);
77
78 private final Configuration configuration;
79
80 private final DocumentSource in;
81
82 private IRI documentIRI;
83
84 private final ExtractorGroup extractors;
85
86 private final TripleHandler output;
87
88 private final EncodingDetector encoderDetector;
89
90 private LocalCopyFactory copyFactory = null;
91
92 private DocumentSource localDocumentSource = null;
93
94 private MIMETypeDetector detector = null;
95
96 private ExtractorGroup matchingExtractors = null;
97
98 private MIMEType detectedMIMEType = null;
99
100 private DocumentReport documentReport = null;
101
102 private ExtractionParameters tagSoupDOMRelatedParameters = null;
103
104 private String parserEncoding = null;
105
106 /**
107 * Builds an extractor by the specification of document source,
108 * list of extractors and output triple handler.
109 *
110 * @param configuration configuration applied during extraction.
111 * @param in input document source.
112 * @param extractors list of extractors to be applied.
113 * @param output output triple handler.
114 */
115 public SingleDocumentExtraction(
116 Configuration configuration, DocumentSource in, ExtractorGroup extractors, TripleHandler output
117 ) {
118 if(configuration == null)
119 throw new NullPointerException("configuration cannot be null.");
120 if(in == null)
121 throw new NullPointerException("in cannot be null.");
122 this.configuration = configuration;
123 this.in = in;
124 this.extractors = extractors;
125
126 List<TripleHandler> tripleHandlers = new ArrayList<>();
127 tripleHandlers.add(output);
128 tripleHandlers.add(new CountingTripleHandler());
129 this.output = new CompositeTripleHandler(tripleHandlers);
130 this.encoderDetector = new TikaEncodingDetector();
131 }
132
133 /**
134 * Builds an extractor by the specification of document source,
135 * extractors factory and output triple handler.
136 *
137 * @param configuration configuration applied during extraction.
138 * @param in input document source.
139 * @param factory the extractors factory.
140 * @param output output triple handler.
141 */
142 public SingleDocumentExtraction(
143 Configuration configuration, DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
144 ) {
145 this(
146 configuration,
147 in,
148 new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)),
149 output
150 );
151 this.setMIMETypeDetector(null);
152 }
153
154 /**
155 * Builds an extractor by the specification of document source,
156 * extractors factory and output triple handler, using the
157 * {@link org.apache.any23.configuration.DefaultConfiguration}.
158 *
159 * @param in input document source.
160 * @param factory the extractors factory.
161 * @param output output triple handler.
162 */
163 public SingleDocumentExtraction(
164 DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
165 ) {
166 this(
167 DefaultConfiguration.singleton(),
168 in,
169 new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)),
170 output
171 );
172 this.setMIMETypeDetector(null);
173 }
174
175 /**
176 * Sets the internal factory for generating the document local copy,
177 * if <code>null</code> the {@link org.apache.any23.source.MemCopyFactory} will be used.
178 *
179 * @param copyFactory local copy factory.
180 * @see org.apache.any23.source.DocumentSource
181 */
182 public void setLocalCopyFactory(LocalCopyFactory copyFactory) {
183 this.copyFactory = copyFactory;
184 }
185
186 /**
187 * Sets the internal mime type detector,
188 * if <code>null</code> mimetype detection will
189 * be skipped and all extractors will be activated.
190 *
191 * @param detector detector instance.
192 */
193 public void setMIMETypeDetector(MIMETypeDetector detector) {
194 this.detector = detector;
195 }
196
197 /**
198 * Triggers the execution of all the {@link Extractor}
199 * registered to this class using the specified extraction parameters.
200 *
201 * @param extractionParameters the parameters applied to the run execution.
202 * @return the report generated by the extraction.
203 * @throws ExtractionException if an error occurred during the data extraction.
204 * @throws IOException if an error occurred during the data access.
205 */
206 public SingleDocumentExtractionReport run(ExtractionParameters extractionParameters)
207 throws ExtractionException, IOException {
208 if(extractionParameters == null) {
209 extractionParameters = ExtractionParameters.newDefault(configuration);
210 }
211
212 final String contextIRI = extractionParameters.getProperty(ExtractionParameters.EXTRACTION_CONTEXT_IRI_PROPERTY);
213 ensureHasLocalCopy();
214 try {
215 this.documentIRI = new Any23ValueFactoryWrapper(
216 SimpleValueFactory.getInstance()
217 ).createIRI( "?".equals(contextIRI) ? in.getDocumentIRI() : contextIRI);
218 } catch (Exception ex) {
219 throw new IllegalArgumentException("Invalid IRI: " + in.getDocumentIRI(), ex);
220 }
221 if(log.isInfoEnabled()) {
222 log.info("Processing " + this.documentIRI);
223 }
224 filterExtractorsByMIMEType();
225
226 if(log.isDebugEnabled()) {
227 StringBuilder sb = new StringBuilder("Extractors ");
228 for (ExtractorFactory<?> factory : matchingExtractors) {
229 sb.append(factory.getExtractorName());
230 sb.append(' ');
231 }
232 sb.append("match ").append(documentIRI);
233 log.debug(sb.toString());
234 }
235
236 final List<ResourceRoot> resourceRoots = new ArrayList<>();
237 final List<PropertyPath> propertyPaths = new ArrayList<>();
238 final Map<String,Collection<IssueReport.Issue>> extractorToIssues =
239 new HashMap<>();
240
241 // Invoke all extractors.
242 try {
243 output.startDocument(documentIRI);
244 } catch (TripleHandlerException e) {
245 log.error(String.format("Error starting document with IRI %s", documentIRI));
246 throw new ExtractionException(String.format("Error starting document with IRI %s", documentIRI),
247 e
248 );
249 }
250 try {
251 output.setContentLength(in.getContentLength());
252 // Create the document context.
253 try {
254 final String documentLanguage = extractDocumentLanguage(extractionParameters);
255 for (ExtractorFactory<?> factory : matchingExtractors) {
256 @SuppressWarnings("rawtypes")
257 final Extractor extractor = factory.createExtractor();
258 final SingleExtractionReport er = runExtractor(
259 extractionParameters,
260 documentLanguage,
261 extractor
262 );
263 resourceRoots.addAll( er.resourceRoots );
264 propertyPaths.addAll( er.propertyPaths );
265 extractorToIssues.put(factory.getExtractorName(), er.issues);
266 }
267 } catch(ValidatorException ve) {
268 throw new ExtractionException("An error occurred during the validation phase.", ve);
269 }
270
271 // Resource consolidation.
272 final boolean addDomainTriples = extractionParameters.getFlag(ExtractionParameters.METADATA_DOMAIN_PER_ENTITY_FLAG);
273 final ExtractionContext consolidationContext;
274 if(extractionParameters.getFlag(ExtractionParameters.METADATA_NESTING_FLAG)) {
275 // Consolidation with nesting.
276 consolidationContext = consolidateResources(resourceRoots, propertyPaths, addDomainTriples, output);
277 } else {
278 consolidationContext = consolidateResources(resourceRoots, addDomainTriples, output);
279 }
280
281 // Adding time/size meta triples.
282 if (extractionParameters.getFlag(ExtractionParameters.METADATA_TIMESIZE_FLAG)) {
283 try {
284 addExtractionTimeSizeMetaTriples(consolidationContext);
285 } catch (TripleHandlerException e) {
286 throw new ExtractionException(
287 String.format(
288 "Error while adding extraction metadata triples document with IRI %s", documentIRI
289 ),
290 e
291 );
292 }
293 }
294 } finally {
295 try {
296 output.endDocument(documentIRI);
297 } catch (TripleHandlerException e) {
298 log.error(String.format("Error ending document with IRI %s", documentIRI));
299 throw new ExtractionException(String.format("Error ending document with IRI %s", documentIRI),
300 e
301 );
302 }
303 }
304
305 return new SingleDocumentExtractionReport(
306 documentReport == null
307 ?
308 EmptyValidationReport.getInstance() : documentReport.getReport(),
309 extractorToIssues
310 );
311 }
312
313 /**
314 * Triggers the execution of all the {@link Extractor}
315 * registered to this class using the <i>default</i> extraction parameters.
316 *
317 * @throws IOException if there is an error reading input from the document source
318 * @throws ExtractionException if there is an error duing distraction
319 * @return the extraction report.
320 */
321 public SingleDocumentExtractionReport run() throws IOException, ExtractionException {
322 return run(ExtractionParameters.newDefault(configuration));
323 }
324
325 /**
326 * Returns the detected mimetype for the given {@link org.apache.any23.source.DocumentSource}.
327 *
328 * @return string containing the detected mimetype.
329 * @throws IOException if an error occurred while accessing the data.
330 */
331 public String getDetectedMIMEType() throws IOException {
332 filterExtractorsByMIMEType();
333 return detectedMIMEType == null ? null : detectedMIMEType.toString();
334 }
335
336 /**
337 * Check whether the given {@link org.apache.any23.source.DocumentSource} content activates of not at least an extractor.
338 *
339 * @return <code>true</code> if at least an extractor is activated, <code>false</code> otherwise.
340 * @throws IOException if there is an error locating matching extractors
341 */
342 public boolean hasMatchingExtractors() throws IOException {
343 filterExtractorsByMIMEType();
344 return !matchingExtractors.isEmpty();
345 }
346
347 /**
348 * @return the list of all the activated extractors for the given {@link org.apache.any23.source.DocumentSource}.
349 */
350 @SuppressWarnings("rawtypes")
351 public List<Extractor> getMatchingExtractors() {
352 final List<Extractor> extractorsList = new ArrayList<>();
353 for(ExtractorFactory extractorFactory : matchingExtractors) {
354 extractorsList.add( extractorFactory.createExtractor() );
355 }
356 return extractorsList;
357 }
358
359 /**
360 * @return the configured parsing encoding.
361 */
362 public String getParserEncoding() {
363 if(this.parserEncoding == null) {
364 this.parserEncoding = detectEncoding();
365 }
366 return this.parserEncoding;
367 }
368
369 /**
370 * Sets the document parser encoding.
371 *
372 * @param encoding parser encoding.
373 */
374 public void setParserEncoding(String encoding) {
375 this.parserEncoding = encoding;
376 documentReport = null;
377 }
378
379 /**
380 * Chech whether the given {@link org.apache.any23.source.DocumentSource} is an <b>HTML</b> document.
381 *
382 * @return <code>true</code> if the document source is an HTML document.
383 * @throws IOException if an error occurs while accessing data.
384 */
385 private boolean isHTMLDocument() throws IOException {
386 filterExtractorsByMIMEType();
387 return ! matchingExtractors.filterByMIMEType( MIMEType.parse("text/html") ).isEmpty();
388 }
389
390 /**
391 * Extracts the document language where possible.
392 *
393 * @param extractionParameters extraction parameters to be applied to determine the document language.
394 * @return the document language if any, <code>null</code> otherwise.
395 * @throws java.io.IOException if an error occurs during the document analysis.
396 * @throws org.apache.any23.validator.ValidatorException
397 */
398 private String extractDocumentLanguage(ExtractionParameters extractionParameters)
399 throws IOException, ValidatorException {
400 if( ! isHTMLDocument() ) {
401 return null;
402 }
403 final HTMLDocument document;
404 try {
405 document = new HTMLDocument( getTagSoupDOM(extractionParameters).getDocument() );
406 } catch (IOException ioe) {
407 log.debug("Cannot extract language from document.", ioe);
408 return null;
409 }
410 return document.getDefaultLanguage();
411 }
412
413 /**
414 * Generates a list of extractors that can be applied to the given document.
415 *
416 * @throws IOException
417 */
418 private void filterExtractorsByMIMEType()
419 throws IOException {
420 if (matchingExtractors != null)
421 return; // has already been run.
422
423 if (detector == null || extractors.allExtractorsSupportAllContentTypes()) {
424 matchingExtractors = extractors;
425 return;
426 }
427 ensureHasLocalCopy();
428 // detect MIME based on the real file IRI rather than based on given base namespace
429 detectedMIMEType = detector.guessMIMEType(
430 java.net.URI.create(in.getDocumentIRI()).getPath(),
431 localDocumentSource.openInputStream(),
432 MIMEType.parse(localDocumentSource.getContentType())
433 );
434 log.debug("detected media type: " + detectedMIMEType);
435 matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
436 }
437
438 /**
439 * Triggers the execution of a specific {@link Extractor}.
440 *
441 * @param extractionParameters the parameters used for the extraction.
442 * @param extractor the {@link Extractor} to be executed.
443 * @throws ExtractionException if an error specific to an extractor happens.
444 * @throws IOException if an IO error occurs during the extraction.
445 * @return the roots of the resources that have been extracted.
446 * @throws org.apache.any23.validator.ValidatorException if an error occurs during validation.
447 */
448 private SingleExtractionReport runExtractor(
449 final ExtractionParameters extractionParameters,
450 final String documentLanguage,
451 final Extractor<?> extractor
452 ) throws ExtractionException, IOException, ValidatorException {
453 if(log.isDebugEnabled()) {
454 log.debug("Running {} on {}", extractor.getDescription().getExtractorName(), documentIRI);
455 }
456 long startTime = System.currentTimeMillis();
457 final ExtractionContext extractionContext = new ExtractionContext(
458 extractor.getDescription().getExtractorName(),
459 documentIRI,
460 documentLanguage
461 );
462 final ExtractionResultImpl extractionResult = new ExtractionResultImpl(extractionContext, extractor, output);
463 try {
464 if (extractor instanceof BlindExtractor) {
465 final BlindExtractor blindExtractor = (BlindExtractor) extractor;
466 blindExtractor.run(extractionParameters, extractionContext, documentIRI, extractionResult);
467 } else if (extractor instanceof ContentExtractor) {
468 ensureHasLocalCopy();
469 final ContentExtractor contentExtractor = (ContentExtractor) extractor;
470 contentExtractor.run(
471 extractionParameters,
472 extractionContext,
473 localDocumentSource.openInputStream(),
474 extractionResult
475 );
476 } else if (extractor instanceof TagSoupDOMExtractor) {
477 final TagSoupDOMExtractor tagSoupDOMExtractor = (TagSoupDOMExtractor) extractor;
478 final DocumentReport documentReport = getTagSoupDOM(extractionParameters);
479 tagSoupDOMExtractor.run(
480 extractionParameters,
481 extractionContext,
482 documentReport.getDocument(),
483 extractionResult
484 );
485 } else {
486 throw new IllegalStateException("Extractor type not supported: " + extractor.getClass());
487 }
488 return
489 new SingleExtractionReport(
490 extractionResult.getIssues(),
491 new ArrayList<ResourceRoot>( extractionResult.getResourceRoots() ),
492 new ArrayList<PropertyPath>( extractionResult.getPropertyPaths() )
493 );
494 } catch (ExtractionException ex) {
495 if(log.isDebugEnabled()) {
496 log.debug(extractor.getDescription().getExtractorName() + ": " + ex.getMessage());
497 }
498 throw ex;
499 } finally {
500 // Logging result error report.
501 if(log.isDebugEnabled() && extractionResult.hasIssues() ) {
502 ByteArrayOutputStream baos = new ByteArrayOutputStream();
503 extractionResult.printReport(new PrintStream(baos));
504 log.debug(baos.toString());
505 }
506 extractionResult.close();
507
508 long elapsed = System.currentTimeMillis() - startTime;
509 if(log.isDebugEnabled()) {
510 log.debug("Completed " + extractor.getDescription().getExtractorName() + ", " + elapsed + "ms");
511 }
512 }
513 }
514
515 /**
516 * Forces the retrieval of the document data.
517 *
518 * @throws IOException
519 */
520 private void ensureHasLocalCopy() throws IOException {
521 if (localDocumentSource != null)
522 return;
523 if (in.isLocal()) {
524 localDocumentSource = in;
525 return;
526 }
527 if (copyFactory == null) {
528 copyFactory = new MemCopyFactory();
529 }
530 localDocumentSource = copyFactory.createLocalCopy(in);
531 }
532
533 /**
534 * Returns the DOM of the given document source (that must be an HTML stream)
535 * and the report of eventual fixes applied on it.
536 *
537 * @param extractionParameters parameters to be used during extraction.
538 * @return document report.
539 * @throws IOException if an error occurs during data access.
540 * @throws ValidatorException if an error occurs during validation.
541 */
542 private DocumentReport getTagSoupDOM(ExtractionParameters extractionParameters)
543 throws IOException, ValidatorException {
544 if (documentReport == null || !extractionParameters.equals(tagSoupDOMRelatedParameters)) {
545 ensureHasLocalCopy();
546 final InputStream is = new BufferedInputStream( localDocumentSource.openInputStream() );
547 is.mark(Integer.MAX_VALUE);
548 final String candidateEncoding = getParserEncoding();
549 is.reset();
550 final TagSoupParser tagSoupParser = new TagSoupParser(
551 is,
552 documentIRI.stringValue(),
553 candidateEncoding
554 );
555 if(extractionParameters.isValidate()) {
556 documentReport = tagSoupParser.getValidatedDOM( extractionParameters.isFix() );
557 } else {
558 documentReport = new DocumentReport( EmptyValidationReport.getInstance(), tagSoupParser.getDOM() );
559 }
560 tagSoupDOMRelatedParameters = extractionParameters;
561 }
562 return documentReport;
563 }
564
565 /**
566 * Detects the encoding of the local document source input stream.
567 *
568 * @return a valid encoding value.
569 */
570 private String detectEncoding() {
571 try {
572 ensureHasLocalCopy();
573 InputStream is = new BufferedInputStream(localDocumentSource.openInputStream());
574 String encoding = this.encoderDetector.guessEncoding(is);
575 is.close();
576 return encoding;
577 } catch (Exception e) {
578 throw new RuntimeException("An error occurred while trying to detect the input encoding.", e);
579 }
580 }
581
582 /**
583 * This function verifies if the <i>candidateSub</i> list of strings
584 * is a prefix of <i>list</i>.
585 *
586 * @param list a list of strings.
587 * @param candidateSub a list of strings.
588 * @return <code>true</code> if <i>candidateSub</i> is a sub path of <i>list</i>,
589 * <code>false</code> otherwise.
590 */
591 private boolean subPath(String[] list, String[] candidateSub) {
592 if(candidateSub.length > list.length) {
593 return false;
594 }
595 for(int i = 0; i < candidateSub.length; i++) {
596 if( ! candidateSub[i].equals(list[i])) {
597 return false;
598 }
599 }
600 return true;
601 }
602
603 /**
604 * Adds for every resource root node a page domain triple.
605 *
606 * @param resourceRoots list of resource roots.
607 * @param context extraction context to produce triples.
608 * @throws ExtractionException
609 */
610 private void addDomainTriplesPerResourceRoots(List<ResourceRoot> resourceRoots, ExtractionContext context)
611 throws ExtractionException {
612 try {
613 // Add source Web domains to every resource root.
614 String domain;
615 try {
616 domain = new java.net.URI(in.getDocumentIRI()).getHost();
617 } catch (URISyntaxException urise) {
618 throw new IllegalArgumentException(
619 "An error occurred while extracting the host from the document IRI.",
620 urise
621 );
622 }
623 if (domain != null) {
624 for (ResourceRoot resourceRoot : resourceRoots) {
625 output.receiveTriple(
626 resourceRoot.getRoot(),
627 vSINDICE.getProperty(SINDICE.DOMAIN),
628 SimpleValueFactory.getInstance().createLiteral(domain),
629 null,
630 context
631 );
632 }
633 }
634 } catch (TripleHandlerException e) {
635 throw new ExtractionException("Error while writing triple triple.", e);
636 } finally {
637 try {
638 output.closeContext(context);
639 } catch (TripleHandlerException e) {
640 throw new ExtractionException("Error while closing context.", e);
641 }
642 }
643 }
644
645 /**
646 * @return an extraction context specific for consolidation triples.
647 */
648 private ExtractionContext createExtractionContext() {
649 return new ExtractionContext(
650 "consolidation-extractor",
651 documentIRI,
652 UUID.randomUUID().toString()
653 );
654 }
655
656 /**
657 * Detect the nesting relationship among different
658 * Microformats and explicit them adding connection triples.
659 *
660 * @param resourceRoots
661 * @param propertyPaths
662 * @param context
663 * @throws TripleHandlerException
664 */
665 private void addNestingRelationship(
666 List<ResourceRoot> resourceRoots,
667 List<PropertyPath> propertyPaths,
668 ExtractionContext context
669 ) throws TripleHandlerException {
670 ResourceRoot currentResourceRoot;
671 PropertyPath currentPropertyPath;
672 for (int r = 0; r < resourceRoots.size(); r++) {
673 currentResourceRoot = resourceRoots.get(r);
674 for (int p = 0; p < propertyPaths.size(); p++) {
675 currentPropertyPath = propertyPaths.get(p);
676 Class<? extends MicroformatExtractor> currentResourceRootExtractor = currentResourceRoot.getExtractor();
677 Class<? extends MicroformatExtractor> currentPropertyPathExtractor = currentPropertyPath.getExtractor();
678 // Avoid wrong nesting relationships.
679 if (currentResourceRootExtractor.equals(currentPropertyPathExtractor)) {
680 continue;
681 }
682 // Avoid self declaring relationships
683 if(MicroformatExtractor.includes(currentPropertyPathExtractor, currentResourceRootExtractor)) {
684 continue;
685 }
686 if (subPath(currentResourceRoot.getPath(), currentPropertyPath.getPath())) {
687 createNestingRelationship(currentPropertyPath, currentResourceRoot, output, context);
688 }
689 }
690 }
691 }
692
693 /**
694 * This method consolidates the graphs extracted from the same document.
695 * In particular it adds:
696 * <ul>
697 * <li>for every microformat root node a triple indicating the original Web page domain;</li>
698 * <li>triples indicating the nesting relationship among a microformat root and property paths of
699 * other nested microformats.
700 * </li>
701 * </ul>
702 * @param resourceRoots list of RDF nodes representing roots of
703 * extracted microformat graphs and the corresponding HTML paths.
704 * @param propertyPaths list of RDF nodes representing property subjects, property IRIs and the HTML paths
705 * from which such properties have been extracted.
706 * @param addDomainTriples
707 * @param output a triple handler event collector.
708 * @return
709 * @throws ExtractionException
710 */
711 private ExtractionContext consolidateResources(
712 List<ResourceRoot> resourceRoots,
713 List<PropertyPath> propertyPaths,
714 boolean addDomainTriples,
715 TripleHandler output
716 ) throws ExtractionException {
717 final ExtractionContext context = createExtractionContext();
718
719 try {
720 output.openContext(context);
721 } catch (TripleHandlerException e) {
722 throw new ExtractionException(
723 String.format("Error starting document with IRI %s", documentIRI),
724 e
725 );
726 }
727
728 try {
729 if(addDomainTriples) {
730 addDomainTriplesPerResourceRoots(resourceRoots, context);
731 }
732 addNestingRelationship(resourceRoots, propertyPaths, context);
733 } catch (TripleHandlerException the) {
734 throw new ExtractionException("Error while writing triple triple.", the);
735 } finally {
736 try {
737 output.closeContext(context);
738 } catch (TripleHandlerException e) {
739 throw new ExtractionException("Error while closing context.", e);
740 }
741 }
742
743 return context;
744 }
745
746 /**
747 * This method consolidates the graphs extracted from the same document.
748 * In particular it adds:
749 * <ul>
750 * <li>for every microformat root node a triple indicating the original Web page domain;</li>
751 * </ul>
752 * @param resourceRoots list of RDF nodes representing roots of
753 * extracted microformat graphs and the corresponding HTML paths.
754 * from which such properties have been extracted.
755 * @param addDomainTriples
756 * @param output a triple handler event collector.
757 * @return
758 * @throws ExtractionException
759 */
760 private ExtractionContext consolidateResources(
761 List<ResourceRoot> resourceRoots,
762 boolean addDomainTriples,
763 TripleHandler output
764 ) throws ExtractionException {
765 final ExtractionContext context = createExtractionContext();
766
767 try {
768 output.openContext(context);
769 } catch (TripleHandlerException e) {
770 throw new ExtractionException(
771 String.format("Error starting document with IRI %s", documentIRI),
772 e
773 );
774 }
775
776 try {
777 if(addDomainTriples) {
778 addDomainTriplesPerResourceRoots(resourceRoots, context);
779 }
780 } finally {
781 try {
782 output.closeContext(context);
783 } catch (TripleHandlerException the) {
784 throw new ExtractionException("Error while closing context.", the);
785 }
786 }
787
788 return context;
789 }
790
791 /**
792 * Adds metadata triples containing the number of extracted triples
793 * and the extraction timestamp.
794 *
795 * @param context
796 * @throws TripleHandlerException
797 */
798 private void addExtractionTimeSizeMetaTriples(ExtractionContext context)
799 throws TripleHandlerException {
800 // adding extraction date
801 String xsdDateTimeNow = RDFUtils.toXSDDateTime(new Date());
802 output.receiveTriple(
803 SimpleValueFactory.getInstance().createIRI(documentIRI.toString()),
804 vSINDICE.getProperty(SINDICE.DATE),
805 SimpleValueFactory.getInstance().createLiteral(xsdDateTimeNow),
806 null,
807 context
808 );
809
810 // adding number of extracted triples
811 int numberOfTriples = 0;
812 CompositeTripleHandler cth = (CompositeTripleHandler) output;
813 for (TripleHandler th : cth.getChilds()) {
814 if (th instanceof CountingTripleHandler) {
815 numberOfTriples = ((CountingTripleHandler) th).getCount();
816 }
817 }
818 output.receiveTriple(
819 SimpleValueFactory.getInstance().createIRI(documentIRI.toString()),
820 vSINDICE.getProperty(SINDICE.SIZE),
821 SimpleValueFactory.getInstance().createLiteral(numberOfTriples + 1), // the number of triples plus itself
822 null,
823 context
824 );
825 }
826
827 /**
828 * Creates a nesting relationship triple.
829 *
830 * @param from the property containing the nested microformat.
831 * @param to the root to the nested microformat.
832 * @param th the triple handler.
833 * @param ec the extraction context used to add such information.
834 * @throws org.apache.any23.writer.TripleHandlerException
835 */
836 private void createNestingRelationship(
837 PropertyPath from,
838 ResourceRoot to,
839 TripleHandler th,
840 ExtractionContext ec
841 ) throws TripleHandlerException {
842 final BNode fromObject = from.getObject();
843 final String bNodeHash = from.getProperty().stringValue() + ( fromObject == null ? "" : fromObject.getID() );
844 BNode bnode = RDFUtils.getBNode(bNodeHash);
845 th.receiveTriple(bnode, vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), from.getProperty(), null, ec );
846 th.receiveTriple(
847 bnode,
848 vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED),
849 from.getObject() == null ? to.getRoot() : from.getObject(),
850 null,
851 ec
852 );
853 th.receiveTriple(
854 from.getSubject(),
855 vSINDICE.getProperty(SINDICE.NESTING),
856 bnode,
857 null,
858 ec
859 );
860 }
861
862 /**
863 * Entity detection report.
864 */
865 private class SingleExtractionReport {
866 private final Collection<IssueReport.Issue> issues;
867 private final List<ResourceRoot> resourceRoots;
868 private final List<PropertyPath> propertyPaths;
869
870 public SingleExtractionReport(
871 Collection<IssueReport.Issue> issues,
872 List<ResourceRoot> resourceRoots,
873 List<PropertyPath> propertyPaths
874 ) {
875 this.issues = issues;
876 this.resourceRoots = resourceRoots;
877 this.propertyPaths = propertyPaths;
878 }
879 }
880
881 }