ANY23-334 fixed: default language was a UUID
[any23.git] / core / src / main / java / org / apache / any23 / extractor / SingleDocumentExtraction.java
1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.any23.extractor;
19
import org.apache.any23.configuration.Configuration;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.encoding.EncodingDetector;
import org.apache.any23.encoding.TikaEncodingDetector;
import org.apache.any23.extractor.Extractor.BlindExtractor;
import org.apache.any23.extractor.Extractor.ContentExtractor;
import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
import org.apache.any23.extractor.html.DocumentReport;
import org.apache.any23.extractor.html.HTMLDocument;
import org.apache.any23.extractor.html.MicroformatExtractor;
import org.apache.any23.extractor.html.TagSoupParser;
import org.apache.any23.mime.MIMEType;
import org.apache.any23.mime.MIMETypeDetector;
import org.apache.any23.rdf.Any23ValueFactoryWrapper;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.LocalCopyFactory;
import org.apache.any23.source.MemCopyFactory;
import org.apache.any23.validator.EmptyValidationReport;
import org.apache.any23.validator.ValidatorException;
import org.apache.any23.vocab.SINDICE;
import org.apache.any23.writer.CompositeTripleHandler;
import org.apache.any23.writer.CountingTripleHandler;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.UUID;

import static org.apache.any23.extractor.TagSoupExtractionResult.PropertyPath;
import static org.apache.any23.extractor.TagSoupExtractionResult.ResourceRoot;
68
69 /**
 * This class acts as a facade through which all the registered extractors are invoked on a single document.
71 */
72 public class SingleDocumentExtraction {
73
    // Shared SINDICE vocabulary instance used to mint metadata property IRIs.
    private static final SINDICE vSINDICE = SINDICE.getInstance();

    private static final Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class);

    // Configuration applied during the extraction run.
    private final Configuration configuration;

    // The original document source provided by the caller.
    private final DocumentSource in;

    // IRI of the document being processed; resolved at the beginning of run().
    private IRI documentIRI;

    // All extractors registered for this extraction.
    private final ExtractorGroup extractors;

    // Composite handler wrapping the caller's handler plus a CountingTripleHandler
    // (see the constructor) so extracted-triple counts can be reported.
    private final TripleHandler output;

    // Detector used to guess the character encoding of the input stream.
    private final EncodingDetector encoderDetector;

    // Factory producing a local copy of the document; defaults to MemCopyFactory
    // in ensureHasLocalCopy() when left null.
    private LocalCopyFactory copyFactory = null;

    // Lazily-created local (re-readable) copy of the input document.
    private DocumentSource localDocumentSource = null;

    // Optional MIME type detector; when null all extractors are activated.
    private MIMETypeDetector detector = null;

    // Extractors whose declared MIME types match the detected document type;
    // computed once by filterExtractorsByMIMEType().
    private ExtractorGroup matchingExtractors = null;

    // MIME type detected for the document, if detection was performed.
    private MIMEType detectedMIMEType = null;

    // Cached tag-soup parse result (DOM plus validation report).
    private DocumentReport documentReport = null;

    // Parameters under which documentReport was produced; used to invalidate the cache.
    private ExtractionParameters tagSoupDOMRelatedParameters = null;

    // Encoding used by the parser; lazily detected when not set explicitly.
    private String parserEncoding = null;
106 /**
107 * Builds an extractor by the specification of document source,
108 * list of extractors and output triple handler.
109 *
110 * @param configuration configuration applied during extraction.
111 * @param in input document source.
112 * @param extractors list of extractors to be applied.
113 * @param output output triple handler.
114 */
115 public SingleDocumentExtraction(
116 Configuration configuration, DocumentSource in, ExtractorGroup extractors, TripleHandler output
117 ) {
118 if(configuration == null)
119 throw new NullPointerException("configuration cannot be null.");
120 if(in == null)
121 throw new NullPointerException("in cannot be null.");
122 this.configuration = configuration;
123 this.in = in;
124 this.extractors = extractors;
125
126 List<TripleHandler> tripleHandlers = new ArrayList<>();
127 tripleHandlers.add(output);
128 tripleHandlers.add(new CountingTripleHandler());
129 this.output = new CompositeTripleHandler(tripleHandlers);
130 this.encoderDetector = new TikaEncodingDetector();
131 }
132
133 /**
134 * Builds an extractor by the specification of document source,
135 * extractors factory and output triple handler.
136 *
137 * @param configuration configuration applied during extraction.
138 * @param in input document source.
139 * @param factory the extractors factory.
140 * @param output output triple handler.
141 */
142 public SingleDocumentExtraction(
143 Configuration configuration, DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
144 ) {
145 this(
146 configuration,
147 in,
148 new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)),
149 output
150 );
151 this.setMIMETypeDetector(null);
152 }
153
154 /**
155 * Builds an extractor by the specification of document source,
156 * extractors factory and output triple handler, using the
157 * {@link org.apache.any23.configuration.DefaultConfiguration}.
158 *
159 * @param in input document source.
160 * @param factory the extractors factory.
161 * @param output output triple handler.
162 */
163 public SingleDocumentExtraction(
164 DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
165 ) {
166 this(
167 DefaultConfiguration.singleton(),
168 in,
169 new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)),
170 output
171 );
172 this.setMIMETypeDetector(null);
173 }
174
175 /**
176 * Sets the internal factory for generating the document local copy,
177 * if <code>null</code> the {@link org.apache.any23.source.MemCopyFactory} will be used.
178 *
179 * @param copyFactory local copy factory.
180 * @see org.apache.any23.source.DocumentSource
181 */
182 public void setLocalCopyFactory(LocalCopyFactory copyFactory) {
183 this.copyFactory = copyFactory;
184 }
185
186 /**
187 * Sets the internal mime type detector,
188 * if <code>null</code> mimetype detection will
189 * be skipped and all extractors will be activated.
190 *
191 * @param detector detector instance.
192 */
193 public void setMIMETypeDetector(MIMETypeDetector detector) {
194 this.detector = detector;
195 }
196
    /**
     * Triggers the execution of all the {@link Extractor}
     * registered to this class using the specified extraction parameters.
     *
     * @param extractionParameters the parameters applied to the run execution;
     *        <code>null</code> is replaced by the defaults for {@code configuration}.
     * @return the report generated by the extraction.
     * @throws ExtractionException if an error occurred during the data extraction.
     * @throws IOException if an error occurred during the data access.
     */
    public SingleDocumentExtractionReport run(ExtractionParameters extractionParameters)
    throws ExtractionException, IOException {
        if(extractionParameters == null) {
            extractionParameters = ExtractionParameters.newDefault(configuration);
        }

        // Resolve the document IRI: the special value "?" means "use the source's own IRI".
        final String contextIRI = extractionParameters.getProperty(ExtractionParameters.EXTRACTION_CONTEXT_IRI_PROPERTY);
        ensureHasLocalCopy();
        try {
            this.documentIRI = new Any23ValueFactoryWrapper(
                SimpleValueFactory.getInstance()
            ).createIRI( "?".equals(contextIRI) ? in.getDocumentIRI() : contextIRI);
        } catch (Exception ex) {
            throw new IllegalArgumentException("Invalid IRI: " + in.getDocumentIRI(), ex);
        }
        if(log.isInfoEnabled()) {
            log.info("Processing " + this.documentIRI);
        }
        // Populates matchingExtractors (no-op when already computed).
        filterExtractorsByMIMEType();

        if(log.isDebugEnabled()) {
            StringBuilder sb = new StringBuilder("Extractors ");
            for (ExtractorFactory<?> factory : matchingExtractors) {
                sb.append(factory.getExtractorName());
                sb.append(' ');
            }
            sb.append("match ").append(documentIRI);
            log.debug(sb.toString());
        }

        // Accumulators filled by each extractor run, consumed by consolidation below.
        final List<ResourceRoot> resourceRoots = new ArrayList<>();
        final List<PropertyPath> propertyPaths = new ArrayList<>();
        final Map<String,Collection<IssueReport.Issue>> extractorToIssues =
            new HashMap<>();

        // Invoke all extractors.
        try {
            output.startDocument(documentIRI);
        } catch (TripleHandlerException e) {
            log.error(String.format("Error starting document with IRI %s", documentIRI));
            throw new ExtractionException(String.format("Error starting document with IRI %s", documentIRI),
                    e
            );
        }
        try {
            output.setContentLength(in.getContentLength());
            // Create the document context.
            final String documentLanguage;
            try {
                // Language is detected once and shared by every extractor's context.
                documentLanguage = extractDocumentLanguage(extractionParameters);
                for (ExtractorFactory<?> factory : matchingExtractors) {
                    @SuppressWarnings("rawtypes")
                    final Extractor extractor = factory.createExtractor();
                    final SingleExtractionReport er = runExtractor(
                            extractionParameters,
                            documentLanguage,
                            extractor
                    );
                    resourceRoots.addAll( er.resourceRoots );
                    propertyPaths.addAll( er.propertyPaths );
                    extractorToIssues.put(factory.getExtractorName(), er.issues);
                }
            } catch(ValidatorException ve) {
                throw new ExtractionException("An error occurred during the validation phase.", ve);
            }

            // Resource consolidation.
            final boolean addDomainTriples = extractionParameters.getFlag(ExtractionParameters.METADATA_DOMAIN_PER_ENTITY_FLAG);
            final ExtractionContext consolidationContext;
            if(extractionParameters.getFlag(ExtractionParameters.METADATA_NESTING_FLAG)) {
                // Consolidation with nesting.
                consolidationContext = consolidateResources(resourceRoots, propertyPaths, addDomainTriples, output, documentLanguage);
            } else {
                consolidationContext = consolidateResources(resourceRoots, addDomainTriples, output, documentLanguage);
            }

            // Adding time/size meta triples.
            if (extractionParameters.getFlag(ExtractionParameters.METADATA_TIMESIZE_FLAG)) {
                try {
                    addExtractionTimeSizeMetaTriples(consolidationContext);
                } catch (TripleHandlerException e) {
                    throw new ExtractionException(
                            String.format(
                                    "Error while adding extraction metadata triples document with IRI %s", documentIRI
                            ),
                            e
                    );
                }
            }
        } finally {
            // endDocument must run even on failure so the handler is left balanced.
            try {
                output.endDocument(documentIRI);
            } catch (TripleHandlerException e) {
                log.error(String.format("Error ending document with IRI %s", documentIRI));
                throw new ExtractionException(String.format("Error ending document with IRI %s", documentIRI),
                        e
                );
            }
        }

        // documentReport is only populated for HTML inputs (see getTagSoupDOM);
        // otherwise an empty validation report is returned.
        return new SingleDocumentExtractionReport(
                documentReport == null
                        ?
                EmptyValidationReport.getInstance() : documentReport.getReport(),
                extractorToIssues
        );
    }
313
    /**
     * Triggers the execution of all the {@link Extractor}
     * registered to this class using the <i>default</i> extraction parameters.
     *
     * @throws IOException if there is an error reading input from the document source
     * @throws ExtractionException if there is an error during extraction
     * @return the extraction report.
     */
    public SingleDocumentExtractionReport run() throws IOException, ExtractionException {
        return run(ExtractionParameters.newDefault(configuration));
    }
325
326 /**
327 * Returns the detected mimetype for the given {@link org.apache.any23.source.DocumentSource}.
328 *
329 * @return string containing the detected mimetype.
330 * @throws IOException if an error occurred while accessing the data.
331 */
332 public String getDetectedMIMEType() throws IOException {
333 filterExtractorsByMIMEType();
334 return detectedMIMEType == null ? null : detectedMIMEType.toString();
335 }
336
337 /**
338 * Check whether the given {@link org.apache.any23.source.DocumentSource} content activates of not at least an extractor.
339 *
340 * @return <code>true</code> if at least an extractor is activated, <code>false</code> otherwise.
341 * @throws IOException if there is an error locating matching extractors
342 */
343 public boolean hasMatchingExtractors() throws IOException {
344 filterExtractorsByMIMEType();
345 return !matchingExtractors.isEmpty();
346 }
347
348 /**
349 * @return the list of all the activated extractors for the given {@link org.apache.any23.source.DocumentSource}.
350 */
351 @SuppressWarnings("rawtypes")
352 public List<Extractor> getMatchingExtractors() {
353 final List<Extractor> extractorsList = new ArrayList<>();
354 for(ExtractorFactory extractorFactory : matchingExtractors) {
355 extractorsList.add( extractorFactory.createExtractor() );
356 }
357 return extractorsList;
358 }
359
360 /**
361 * @return the configured parsing encoding.
362 */
363 public String getParserEncoding() {
364 if(this.parserEncoding == null) {
365 this.parserEncoding = detectEncoding();
366 }
367 return this.parserEncoding;
368 }
369
370 /**
371 * Sets the document parser encoding.
372 *
373 * @param encoding parser encoding.
374 */
375 public void setParserEncoding(String encoding) {
376 this.parserEncoding = encoding;
377 documentReport = null;
378 }
379
380 /**
381 * Chech whether the given {@link org.apache.any23.source.DocumentSource} is an <b>HTML</b> document.
382 *
383 * @return <code>true</code> if the document source is an HTML document.
384 * @throws IOException if an error occurs while accessing data.
385 */
386 private boolean isHTMLDocument() throws IOException {
387 filterExtractorsByMIMEType();
388 return ! matchingExtractors.filterByMIMEType( MIMEType.parse("text/html") ).isEmpty();
389 }
390
391 /**
392 * Extracts the document language where possible.
393 *
394 * @param extractionParameters extraction parameters to be applied to determine the document language.
395 * @return the document language if any, <code>null</code> otherwise.
396 * @throws java.io.IOException if an error occurs during the document analysis.
397 * @throws org.apache.any23.validator.ValidatorException
398 */
399 private String extractDocumentLanguage(ExtractionParameters extractionParameters)
400 throws IOException, ValidatorException {
401 if( ! isHTMLDocument() ) {
402 return null;
403 }
404 final HTMLDocument document;
405 try {
406 document = new HTMLDocument( getTagSoupDOM(extractionParameters).getDocument() );
407 } catch (IOException ioe) {
408 log.debug("Cannot extract language from document.", ioe);
409 return null;
410 }
411 return document.getDefaultLanguage();
412 }
413
    /**
     * Generates a list of extractors that can be applied to the given document,
     * caching the result in {@code matchingExtractors}. When no detector is
     * configured (or every extractor accepts any content type) all extractors match.
     *
     * @throws IOException if an error occurs while accessing the document data.
     */
    private void filterExtractorsByMIMEType()
    throws IOException {
        if (matchingExtractors != null)
            return;  // has already been run.

        // No detector, or detection would not narrow anything: activate everything.
        if (detector == null || extractors.allExtractorsSupportAllContentTypes()) {
            matchingExtractors = extractors;
            return;
        }
        ensureHasLocalCopy();
        // detect MIME based on the real file IRI rather than based on given base namespace
        detectedMIMEType = detector.guessMIMEType(
                java.net.URI.create(in.getDocumentIRI()).getPath(),
                localDocumentSource.openInputStream(),
                MIMEType.parse(localDocumentSource.getContentType())
        );
        log.debug("detected media type: " + detectedMIMEType);
        matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
    }
438
    /**
     * Triggers the execution of a specific {@link Extractor}, dispatching on the
     * extractor's concrete kind (blind / content / tag-soup DOM).
     *
     * @param extractionParameters the parameters used for the extraction.
     * @param documentLanguage language detected for the document, possibly <code>null</code>.
     * @param extractor the {@link Extractor} to be executed.
     * @throws ExtractionException if an error specific to an extractor happens.
     * @throws IOException if an IO error occurs during the extraction.
     * @return the roots of the resources that have been extracted.
     * @throws org.apache.any23.validator.ValidatorException if an error occurs during validation.
     */
    private SingleExtractionReport runExtractor(
            final ExtractionParameters extractionParameters,
            final String documentLanguage,
            final Extractor<?> extractor
    ) throws ExtractionException, IOException, ValidatorException {
        if(log.isDebugEnabled()) {
            log.debug("Running {} on {}", extractor.getDescription().getExtractorName(), documentIRI);
        }
        long startTime = System.currentTimeMillis();
        final ExtractionContext extractionContext = new ExtractionContext(
                extractor.getDescription().getExtractorName(),
                documentIRI,
                documentLanguage
        );
        final ExtractionResultImpl extractionResult = new ExtractionResultImpl(extractionContext, extractor, output);
        try {
            // Dispatch on extractor kind: each kind consumes a different input form.
            if (extractor instanceof BlindExtractor) {
                // Blind extractors only see the document IRI, not its content.
                final BlindExtractor blindExtractor = (BlindExtractor) extractor;
                blindExtractor.run(extractionParameters, extractionContext, documentIRI, extractionResult);
            } else if (extractor instanceof ContentExtractor) {
                // Content extractors consume the raw input stream of the local copy.
                ensureHasLocalCopy();
                final ContentExtractor contentExtractor = (ContentExtractor) extractor;
                contentExtractor.run(
                        extractionParameters,
                        extractionContext,
                        localDocumentSource.openInputStream(),
                        extractionResult
                );
            } else if (extractor instanceof TagSoupDOMExtractor) {
                // DOM extractors consume the (cached) tag-soup parsed document.
                final TagSoupDOMExtractor tagSoupDOMExtractor = (TagSoupDOMExtractor) extractor;
                final DocumentReport documentReport = getTagSoupDOM(extractionParameters);
                tagSoupDOMExtractor.run(
                        extractionParameters,
                        extractionContext,
                        documentReport.getDocument(),
                        extractionResult
                );
            } else {
                throw new IllegalStateException("Extractor type not supported: " + extractor.getClass());
            }
            return
                new SingleExtractionReport(
                        extractionResult.getIssues(),
                        new ArrayList<ResourceRoot>( extractionResult.getResourceRoots() ),
                        new ArrayList<PropertyPath>( extractionResult.getPropertyPaths() )
                );
        } catch (ExtractionException ex) {
            if(log.isDebugEnabled()) {
                log.debug(extractor.getDescription().getExtractorName() + ": " + ex.getMessage());
            }
            throw ex;
        } finally {
            // Logging result error report.
            if(log.isDebugEnabled() && extractionResult.hasIssues() ) {
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                extractionResult.printReport(new PrintStream(baos));
                log.debug(baos.toString());
            }
            // Always close the result, even when the extractor failed.
            extractionResult.close();

            long elapsed = System.currentTimeMillis() - startTime;
            if(log.isDebugEnabled()) {
                log.debug("Completed " + extractor.getDescription().getExtractorName() + ", " + elapsed + "ms");
            }
        }
    }
515
516 /**
517 * Forces the retrieval of the document data.
518 *
519 * @throws IOException
520 */
521 private void ensureHasLocalCopy() throws IOException {
522 if (localDocumentSource != null)
523 return;
524 if (in.isLocal()) {
525 localDocumentSource = in;
526 return;
527 }
528 if (copyFactory == null) {
529 copyFactory = new MemCopyFactory();
530 }
531 localDocumentSource = copyFactory.createLocalCopy(in);
532 }
533
    /**
     * Returns the DOM of the given document source (that must be an HTML stream)
     * and the report of eventual fixes applied on it. The result is cached and
     * reused until the extraction parameters change (or the encoding is reset,
     * see {@code setParserEncoding}).
     *
     * @param extractionParameters parameters to be used during extraction.
     * @return document report.
     * @throws IOException if an error occurs during data access.
     * @throws ValidatorException if an error occurs during validation.
     */
    private DocumentReport getTagSoupDOM(ExtractionParameters extractionParameters)
    throws IOException, ValidatorException {
        if (documentReport == null || !extractionParameters.equals(tagSoupDOMRelatedParameters)) {
            ensureHasLocalCopy();
            final InputStream is = new BufferedInputStream( localDocumentSource.openInputStream() );
            // NOTE(review): mark/reset brackets the encoding lookup defensively;
            // detectEncoding() opens its own stream on localDocumentSource, so
            // this appears to guard against future changes — confirm before removing.
            is.mark(Integer.MAX_VALUE);
            final String candidateEncoding = getParserEncoding();
            is.reset();
            final TagSoupParser tagSoupParser = new TagSoupParser(
                    is,
                    documentIRI.stringValue(),
                    candidateEncoding
            );
            if(extractionParameters.isValidate()) {
                documentReport = tagSoupParser.getValidatedDOM( extractionParameters.isFix() );
            } else {
                documentReport = new DocumentReport( EmptyValidationReport.getInstance(), tagSoupParser.getDOM() );
            }
            tagSoupDOMRelatedParameters = extractionParameters;
        }
        return documentReport;
    }
565
566 /**
567 * Detects the encoding of the local document source input stream.
568 *
569 * @return a valid encoding value.
570 */
571 private String detectEncoding() {
572 try {
573 ensureHasLocalCopy();
574 InputStream is = new BufferedInputStream(localDocumentSource.openInputStream());
575 String encoding = this.encoderDetector.guessEncoding(is);
576 is.close();
577 return encoding;
578 } catch (Exception e) {
579 throw new RuntimeException("An error occurred while trying to detect the input encoding.", e);
580 }
581 }
582
583 /**
584 * This function verifies if the <i>candidateSub</i> list of strings
585 * is a prefix of <i>list</i>.
586 *
587 * @param list a list of strings.
588 * @param candidateSub a list of strings.
589 * @return <code>true</code> if <i>candidateSub</i> is a sub path of <i>list</i>,
590 * <code>false</code> otherwise.
591 */
592 private boolean subPath(String[] list, String[] candidateSub) {
593 if(candidateSub.length > list.length) {
594 return false;
595 }
596 for(int i = 0; i < candidateSub.length; i++) {
597 if( ! candidateSub[i].equals(list[i])) {
598 return false;
599 }
600 }
601 return true;
602 }
603
604 /**
605 * Adds for every resource root node a page domain triple.
606 *
607 * @param resourceRoots list of resource roots.
608 * @param context extraction context to produce triples.
609 * @throws ExtractionException
610 */
611 private void addDomainTriplesPerResourceRoots(List<ResourceRoot> resourceRoots, ExtractionContext context)
612 throws ExtractionException {
613 try {
614 // Add source Web domains to every resource root.
615 String domain;
616 try {
617 domain = new java.net.URI(in.getDocumentIRI()).getHost();
618 } catch (URISyntaxException urise) {
619 throw new IllegalArgumentException(
620 "An error occurred while extracting the host from the document IRI.",
621 urise
622 );
623 }
624 if (domain != null) {
625 for (ResourceRoot resourceRoot : resourceRoots) {
626 output.receiveTriple(
627 resourceRoot.getRoot(),
628 vSINDICE.getProperty(SINDICE.DOMAIN),
629 SimpleValueFactory.getInstance().createLiteral(domain),
630 null,
631 context
632 );
633 }
634 }
635 } catch (TripleHandlerException e) {
636 throw new ExtractionException("Error while writing triple triple.", e);
637 } finally {
638 try {
639 output.closeContext(context);
640 } catch (TripleHandlerException e) {
641 throw new ExtractionException("Error while closing context.", e);
642 }
643 }
644 }
645
646 /**
647 * @return an extraction context specific for consolidation triples.
648 */
649 private ExtractionContext createExtractionContext(String defaultLanguage) {
650 return new ExtractionContext(
651 "consolidation-extractor",
652 documentIRI,
653 defaultLanguage,
654 UUID.randomUUID().toString()
655 );
656 }
657
658 /**
659 * Detect the nesting relationship among different
660 * Microformats and explicit them adding connection triples.
661 *
662 * @param resourceRoots
663 * @param propertyPaths
664 * @param context
665 * @throws TripleHandlerException
666 */
667 private void addNestingRelationship(
668 List<ResourceRoot> resourceRoots,
669 List<PropertyPath> propertyPaths,
670 ExtractionContext context
671 ) throws TripleHandlerException {
672 ResourceRoot currentResourceRoot;
673 PropertyPath currentPropertyPath;
674 for (int r = 0; r < resourceRoots.size(); r++) {
675 currentResourceRoot = resourceRoots.get(r);
676 for (int p = 0; p < propertyPaths.size(); p++) {
677 currentPropertyPath = propertyPaths.get(p);
678 Class<? extends MicroformatExtractor> currentResourceRootExtractor = currentResourceRoot.getExtractor();
679 Class<? extends MicroformatExtractor> currentPropertyPathExtractor = currentPropertyPath.getExtractor();
680 // Avoid wrong nesting relationships.
681 if (currentResourceRootExtractor.equals(currentPropertyPathExtractor)) {
682 continue;
683 }
684 // Avoid self declaring relationships
685 if(MicroformatExtractor.includes(currentPropertyPathExtractor, currentResourceRootExtractor)) {
686 continue;
687 }
688 if (subPath(currentResourceRoot.getPath(), currentPropertyPath.getPath())) {
689 createNestingRelationship(currentPropertyPath, currentResourceRoot, output, context);
690 }
691 }
692 }
693 }
694
695 /**
696 * This method consolidates the graphs extracted from the same document.
697 * In particular it adds:
698 * <ul>
699 * <li>for every microformat root node a triple indicating the original Web page domain;</li>
700 * <li>triples indicating the nesting relationship among a microformat root and property paths of
701 * other nested microformats.
702 * </li>
703 * </ul>
704 * @param resourceRoots list of RDF nodes representing roots of
705 * extracted microformat graphs and the corresponding HTML paths.
706 * @param propertyPaths list of RDF nodes representing property subjects, property IRIs and the HTML paths
707 * from which such properties have been extracted.
708 * @param addDomainTriples
709 * @param output a triple handler event collector.
710 * @return
711 * @throws ExtractionException
712 */
713 private ExtractionContext consolidateResources(
714 List<ResourceRoot> resourceRoots,
715 List<PropertyPath> propertyPaths,
716 boolean addDomainTriples,
717 TripleHandler output,
718 String defaultLanguage
719 ) throws ExtractionException {
720 final ExtractionContext context = createExtractionContext(defaultLanguage);
721
722 try {
723 output.openContext(context);
724 } catch (TripleHandlerException e) {
725 throw new ExtractionException(
726 String.format("Error starting document with IRI %s", documentIRI),
727 e
728 );
729 }
730
731 try {
732 if(addDomainTriples) {
733 addDomainTriplesPerResourceRoots(resourceRoots, context);
734 }
735 addNestingRelationship(resourceRoots, propertyPaths, context);
736 } catch (TripleHandlerException the) {
737 throw new ExtractionException("Error while writing triple triple.", the);
738 } finally {
739 try {
740 output.closeContext(context);
741 } catch (TripleHandlerException e) {
742 throw new ExtractionException("Error while closing context.", e);
743 }
744 }
745
746 return context;
747 }
748
749 /**
750 * This method consolidates the graphs extracted from the same document.
751 * In particular it adds:
752 * <ul>
753 * <li>for every microformat root node a triple indicating the original Web page domain;</li>
754 * </ul>
755 * @param resourceRoots list of RDF nodes representing roots of
756 * extracted microformat graphs and the corresponding HTML paths.
757 * from which such properties have been extracted.
758 * @param addDomainTriples
759 * @param output a triple handler event collector.
760 * @return
761 * @throws ExtractionException
762 */
763 private ExtractionContext consolidateResources(
764 List<ResourceRoot> resourceRoots,
765 boolean addDomainTriples,
766 TripleHandler output,
767 String defaultLanguage
768 ) throws ExtractionException {
769 final ExtractionContext context = createExtractionContext(defaultLanguage);
770
771 try {
772 output.openContext(context);
773 } catch (TripleHandlerException e) {
774 throw new ExtractionException(
775 String.format("Error starting document with IRI %s", documentIRI),
776 e
777 );
778 }
779
780 try {
781 if(addDomainTriples) {
782 addDomainTriplesPerResourceRoots(resourceRoots, context);
783 }
784 } finally {
785 try {
786 output.closeContext(context);
787 } catch (TripleHandlerException the) {
788 throw new ExtractionException("Error while closing context.", the);
789 }
790 }
791
792 return context;
793 }
794
    /**
     * Adds metadata triples containing the number of extracted triples
     * and the extraction timestamp.
     *
     * @param context extraction context used to emit the metadata triples.
     * @throws TripleHandlerException if a metadata triple cannot be written.
     */
    private void addExtractionTimeSizeMetaTriples(ExtractionContext context)
    throws TripleHandlerException {
        // adding extraction date
        String xsdDateTimeNow = RDFUtils.toXSDDateTime(new Date());
        output.receiveTriple(
                SimpleValueFactory.getInstance().createIRI(documentIRI.toString()),
                vSINDICE.getProperty(SINDICE.DATE),
                SimpleValueFactory.getInstance().createLiteral(xsdDateTimeNow),
                null,
                context
        );

        // adding number of extracted triples
        int numberOfTriples = 0;
        // Safe cast: the constructor always wraps the handlers into a
        // CompositeTripleHandler containing a CountingTripleHandler.
        CompositeTripleHandler cth = (CompositeTripleHandler) output;
        for (TripleHandler th : cth.getChilds()) {
            if (th instanceof CountingTripleHandler) {
                numberOfTriples = ((CountingTripleHandler) th).getCount();
            }
        }
        output.receiveTriple(
                SimpleValueFactory.getInstance().createIRI(documentIRI.toString()),
                vSINDICE.getProperty(SINDICE.SIZE),
                SimpleValueFactory.getInstance().createLiteral(numberOfTriples + 1), // the number of triples plus itself
                null,
                context
        );
    }
830
    /**
     * Creates a nesting relationship triple, reifying the relation through a
     * blank node derived deterministically from the property IRI and the
     * (optional) property object.
     *
     * @param from the property containing the nested microformat.
     * @param to the root to the nested microformat.
     * @param th the triple handler.
     * @param ec the extraction context used to add such information.
     * @throws org.apache.any23.writer.TripleHandlerException if a triple cannot be written.
     */
    private void createNestingRelationship(
            PropertyPath from,
            ResourceRoot to,
            TripleHandler th,
            ExtractionContext ec
    ) throws TripleHandlerException {
        final BNode fromObject = from.getObject();
        // Hash input: property IRI plus object BNode id (empty when no object).
        final String bNodeHash = from.getProperty().stringValue() + ( fromObject == null ? "" : fromObject.getID() );
        BNode bnode = RDFUtils.getBNode(bNodeHash);
        th.receiveTriple(bnode, vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), from.getProperty(), null, ec );
        // NOTE(review): re-invokes from.getObject() instead of reusing the
        // captured fromObject above — equivalent only if the getter is stable.
        th.receiveTriple(
                bnode,
                vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED),
                from.getObject() == null ? to.getRoot() : from.getObject(),
                null,
                ec
        );
        th.receiveTriple(
                from.getSubject(),
                vSINDICE.getProperty(SINDICE.NESTING),
                bnode,
                null,
                ec
        );
    }
865
866 /**
867 * Entity detection report.
868 */
869 private class SingleExtractionReport {
870 private final Collection<IssueReport.Issue> issues;
871 private final List<ResourceRoot> resourceRoots;
872 private final List<PropertyPath> propertyPaths;
873
874 public SingleExtractionReport(
875 Collection<IssueReport.Issue> issues,
876 List<ResourceRoot> resourceRoots,
877 List<PropertyPath> propertyPaths
878 ) {
879 this.issues = issues;
880 this.resourceRoots = resourceRoots;
881 this.propertyPaths = propertyPaths;
882 }
883 }
884
885 }