1
2
3
4
5
6
7
8
9
10
11
12
13
14 package eu.fbk.rdfpro;
15
16 import java.io.IOException;
17 import java.util.ArrayList;
18 import java.util.Arrays;
19 import java.util.BitSet;
20 import java.util.Collections;
21 import java.util.HashMap;
22 import java.util.HashSet;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.Objects;
26 import java.util.Set;
27 import java.util.concurrent.ConcurrentHashMap;
28 import java.util.function.Consumer;
29
30 import javax.annotation.Nullable;
31
32 import org.openrdf.model.BNode;
33 import org.openrdf.model.Literal;
34 import org.openrdf.model.Resource;
35 import org.openrdf.model.Statement;
36 import org.openrdf.model.URI;
37 import org.openrdf.model.Value;
38 import org.openrdf.model.vocabulary.OWL;
39 import org.openrdf.model.vocabulary.RDF;
40 import org.openrdf.model.vocabulary.XMLSchema;
41 import org.openrdf.rio.RDFHandler;
42 import org.openrdf.rio.RDFHandlerException;
43 import org.slf4j.Logger;
44 import org.slf4j.LoggerFactory;
45
46 import eu.fbk.rdfpro.util.IO;
47 import eu.fbk.rdfpro.util.Namespaces;
48 import eu.fbk.rdfpro.util.Sorter;
49 import eu.fbk.rdfpro.util.Sorter.Input;
50 import eu.fbk.rdfpro.util.Sorter.Output;
51 import eu.fbk.rdfpro.util.Statements;
52 import eu.fbk.rdfpro.vocab.VOID;
53 import eu.fbk.rdfpro.vocab.VOIDX;
54
55 final class ProcessorStats implements RDFProcessor {
56
/** Logger used to report the status of the statistics computation. */
private static final Logger LOGGER = LoggerFactory.getLogger(ProcessorStats.class);

/** Prefix of the URIs minted for the emitted datasets; {@code "stats:"} is used when null. */
@Nullable
private final String outputNamespace;

/**
 * Predicate of the statements that associate a resource with a source URI; the resource is
 * then matched against the contexts of the processed statements. Sources are not tracked
 * when this is null.
 */
@Nullable
private final URI sourceProperty;

/** When non-null, only source statements asserted in this context are considered. */
@Nullable
private final URI sourceContext;

/** Whether the sets of co-occurring types and properties are tracked. */
private final boolean processCooccurrences;

/** Minimum number of entities (types) or triples (properties) needed to emit statistics. */
private final long threshold;

/**
 * Creates a new statistics processor.
 *
 * @param outputNamespace the namespace of minted URIs, possibly null
 * @param sourceProperty the predicate linking a context to its source, possibly null
 * @param sourceContext the context where source statements are looked up, possibly null
 * @param threshold the minimum count required to emit a type / property; a null value is
 *        treated as zero
 * @param processCooccurrences true to track co-occurring types and properties
 */
ProcessorStats(@Nullable final String outputNamespace, @Nullable final URI sourceProperty,
        @Nullable final URI sourceContext, @Nullable final Long threshold,
        final boolean processCooccurrences) {
    this.outputNamespace = outputNamespace;
    this.sourceProperty = sourceProperty;
    this.sourceContext = sourceContext;
    this.threshold = threshold == null ? 0L : threshold.longValue();
    this.processCooccurrences = processCooccurrences;
}
81
82 @Override
83 public RDFHandler wrap(final RDFHandler handler) {
84 return new Handler(Objects.requireNonNull(handler));
85 }
86
87 private final class Handler extends AbstractRDFHandler {
88
/** The downstream handler that receives the emitted statistics. */
private final RDFHandler handler;

// Sources: positional list and lookup by URI; index 0 / null key denote "all the data".

private final List<SourceStats> sourceList;

private final Map<URI, SourceStats> sourceMap;

/** Canonical instances of source URIs, so that equal URIs share a single object. */
private final ConcurrentHashMap<URI, URI> sourceInterner;

// Types: positional list (the position is the index stored in records) and lookup by URI.

private final List<TypeStats> typeList;

private final Map<URI, TypeStats> typeMap;

// Properties: positional list and lookup by URI; rdf:type is always at index 0.

private final List<PropertyStats> propertyList;

private final Map<URI, PropertyStats> propertyMap;

// Contexts: positional list and lookup by hash of the context resource.

private final List<Context> contextList;

private final Map<Hash, Context> contextMap;

/** Samplers collecting example statements, indexed by subject URI. */
private final Map<URI, TypeStats.Sampler> samplerMap;

/** URI strings already minted in the current pass, used to avoid duplicates. */
private final Set<String> mintedURIs;

// State of the block of direct records (same subject) being processed.

private Hash directBlockSubject;

private final Map<SourceStats, PartialStats> directBlockStats;

private final Set<PropertyStats.Partition> directBlockPartitions;

// State of the block of inverse records (same object) being processed.

private Hash inverseBlockObject;

private long inverseBlockVersion;

/** The sorter buffering records during the first pass; null outside of it. */
private Sorter<Record> sorter;

/** True until the first pass has been completed. */
private boolean firstPass;

/**
 * Creates a handler emitting statistics to the handler specified.
 *
 * @param handler the handler receiving the statistics
 */
Handler(final RDFHandler handler) {
    this.handler = handler;

    this.sourceList = new ArrayList<>();
    this.sourceMap = new HashMap<>();
    this.sourceInterner = new ConcurrentHashMap<>();

    this.typeList = new ArrayList<>();
    this.typeMap = new HashMap<>();

    this.propertyList = new ArrayList<>();
    this.propertyMap = new HashMap<>();

    this.contextList = new ArrayList<>();
    this.contextMap = new HashMap<>();

    this.samplerMap = new HashMap<>();
    this.mintedURIs = new HashSet<>();

    this.directBlockSubject = null;
    this.directBlockStats = new HashMap<>();
    this.directBlockPartitions = new HashSet<>();

    this.inverseBlockObject = null;
    this.inverseBlockVersion = 0L;

    this.sorter = null;
    this.firstPass = true;

    // rdf:type must get index 0, as type records are decoded assuming that property index
    final PropertyStats typeProperty = new PropertyStats(RDF.TYPE, 0);
    this.propertyList.add(typeProperty);
    this.propertyMap.put(RDF.TYPE, typeProperty);
}
152
153 @Override
154 public void startRDF() throws RDFHandlerException {
155 this.handler.startRDF();
156 this.mintedURIs.clear();
157 if (this.firstPass) {
158 this.sorter = new Sorter<Record>() {
159
160 @Override
161 protected void encode(final Output writer, final Record record)
162 throws IOException {
163 record.write(writer);
164 }
165
166 @Override
167 protected Record decode(final Input reader) throws IOException {
168 return Record.read(reader);
169 }
170
171 };
172 try {
173 this.sorter.start(true);
174 } catch (final IOException ex) {
175 throw new RDFHandlerException(ex);
176 }
177 }
178 }
179
180 @Override
181 public void handleStatement(final Statement statement) throws RDFHandlerException {
182
183 if (!this.firstPass) {
184 return;
185 }
186
187 final Resource s = statement.getSubject();
188 final URI p = statement.getPredicate();
189 final Value o = statement.getObject();
190 final Resource c = statement.getContext();
191
192 final boolean isURIType = o instanceof URI && p.equals(RDF.TYPE);
193 final Hash sh = Hash.create(s);
194 final Hash oh = isURIType ? null : Hash.create(o);
195
196 PropertyStats ps;
197 synchronized (this.propertyList) {
198 ps = this.propertyMap.get(p);
199 if (ps == null) {
200 ps = new PropertyStats(p, this.propertyList.size());
201 this.propertyMap.put(p, ps);
202 this.propertyList.add(ps);
203 }
204 }
205
206 TypeStats ts = null;
207 if (isURIType) {
208 synchronized (this.typeList) {
209 ts = this.typeMap.get(o);
210 if (ts == null) {
211 ts = new TypeStats((URI) o, this.typeList.size());
212 this.typeMap.put((URI) o, ts);
213 this.typeList.add(ts);
214 }
215 }
216 }
217
218 Context ctx = null;
219 if (c != null) {
220 final Hash ch = Hash.create(c);
221 synchronized (this.contextList) {
222 ctx = this.contextMap.get(ch);
223 if (ctx == null) {
224 ctx = new Context(this.contextList.size());
225 this.contextMap.put(ch, ctx);
226 this.contextList.add(ctx);
227 }
228 ctx.used = true;
229 }
230 }
231
232 if (o instanceof URI
233 && p.equals(ProcessorStats.this.sourceProperty)
234 && (ProcessorStats.this.sourceContext == null || Objects.equals(c,
235 ProcessorStats.this.sourceContext))) {
236 URI source = this.sourceInterner.putIfAbsent((URI) o, (URI) o);
237 source = source != null ? source : (URI) o;
238 Context sctx;
239 synchronized (this.contextList) {
240 sctx = this.contextMap.get(sh);
241 if (sctx == null) {
242 sctx = new Context(this.contextList.size());
243 this.contextMap.put(sh, sctx);
244 this.contextList.add(sctx);
245 }
246 }
247 synchronized (sctx) {
248 if (!Arrays.asList(sctx.sources).contains(source)) {
249 final URI[] array = new URI[sctx.sources.length + 1];
250 System.arraycopy(sctx.sources, 0, array, 0, sctx.sources.length);
251 array[array.length - 1] = source;
252 sctx.sources = array;
253 }
254 }
255 }
256
257 final int pi = ps.index;
258 final int ti = ts == null ? -1 : ts.index;
259 final int ci = ctx == null ? -1 : ctx.index;
260
261 final Record direct = Record.create(false, sh, pi, ti, oh, ci);
262 final Record inverse = isURIType ? null : Record.create(true, null, pi, ti, oh, ci);
263
264 try {
265 this.sorter.emit(direct);
266 if (inverse != null) {
267 this.sorter.emit(inverse);
268 }
269 } catch (final Throwable ex) {
270 throw new RDFHandlerException(ex);
271 }
272
273 synchronized (ps) {
274 if (ps.sampler == null) {
275 ps.sampler = new PropertyStats.Sampler();
276 }
277 ps.sampler.add(statement);
278 }
279
280 if (s instanceof URI) {
281 synchronized (this.samplerMap) {
282 TypeStats.Sampler sampler = this.samplerMap.get(s);
283 if (sampler != null) {
284 sampler.add(statement);
285 if (ts != null && ts.sampler == null) {
286 ts.sampler = sampler;
287 }
288 } else if (ts != null && ts.sampler == null) {
289 sampler = new TypeStats.Sampler();
290 sampler.add(statement);
291 ts.sampler = sampler;
292 this.samplerMap.put((URI) s, sampler);
293 }
294 }
295 }
296 }
297
298 @Override
299 public void endRDF() throws RDFHandlerException {
300 if (this.firstPass) {
301 try {
302 this.typeMap.clear();
303 this.propertyMap.clear();
304 this.contextMap.clear();
305 this.samplerMap.clear();
306 this.sourceInterner.clear();
307
308 final SourceStats s0 = new SourceStats(null, 0);
309 this.sourceMap.put(null, s0);
310 this.sourceList.add(s0);
311
312 for (int i = 0; i < this.contextList.size(); ++i) {
313 final Context ctx = this.contextList.get(i);
314 if (!ctx.used) {
315 this.contextList.set(i, null);
316 } else {
317 for (final URI source : ctx.sources) {
318 SourceStats ss = this.sourceMap.get(source);
319 if (ss == null) {
320 ss = new SourceStats(source, this.sourceList.size());
321 this.sourceMap.put(source, ss);
322 this.sourceList.add(ss);
323 }
324 }
325 }
326 }
327
328 for (final TypeStats ts : this.typeList) {
329 ts.partitions = new TypeStats.Partition[this.sourceList.size()];
330 ts.partitions[0] = new TypeStats.Partition();
331 if (ts.sampler != null) {
332 ts.example = ts.sampler.build();
333 ts.sampler = null;
334 }
335 }
336
337 for (final PropertyStats ps : this.propertyList) {
338 ps.partitions = new PropertyStats.Partition[this.sourceList.size()];
339 ps.partitions[0] = new PropertyStats.Partition();
340 if (ps.sampler != null) {
341 ps.example = ps.sampler.build();
342 ps.sampler = null;
343 }
344 }
345
346 LOGGER.debug("Status: {} properties, {} types, {} contexts, " + "{} sources",
347 this.propertyList.size(), this.typeList.size(),
348 this.contextMap.size(), this.sourceList.size());
349
350 this.sorter.end(false, new Consumer<Record>() {
351
352 @Override
353 public void accept(final Record record) {
354 if (record.inverse) {
355 handleInverseRecord(record);
356 } else {
357 handleDirectRecord(record);
358 }
359 }
360
361 });
362 this.sorter = null;
363 handleDirectRecord(null);
364
365 } catch (final IOException ex) {
366 throw new RDFHandlerException(ex);
367 }
368 }
369 emitStatistics();
370 this.handler.endRDF();
371 this.firstPass = false;
372 }
373
374 @Override
375 public void close() {
376 IO.closeQuietly(this.sorter);
377 IO.closeQuietly(this.handler);
378 }
379
380 private void handleDirectRecord(@Nullable final Record record) {
381
382 if (record == null || !record.subject.equals(this.directBlockSubject)) {
383 for (final Map.Entry<SourceStats, PartialStats> e : this.directBlockStats
384 .entrySet()) {
385 final int index = e.getKey().index;
386 final PartialStats s = e.getValue();
387 if (s.tss != null) {
388 for (final TypeStats ts : s.tss) {
389 TypeStats.Partition tp = ts.partitions[index];
390 if (tp == null) {
391 tp = new TypeStats.Partition();
392 ts.partitions[index] = tp;
393 }
394 tp.triples += s.triples;
395 tp.tboxTriples += s.tboxTriples;
396 tp.aboxTriples += s.aboxTriples;
397 tp.typeTriples += s.typeTriples;
398 tp.sameAsTriples += s.sameAsTriples;
399 tp.predicates += s.pss == null ? 0 : s.pss.size();
400 tp.entities += s.entities;
401 if (ProcessorStats.this.processCooccurrences) {
402 tp.types = tp.types != null ? tp.types : new BitSet();
403 tp.properties = tp.properties != null ? tp.properties
404 : new BitSet();
405 if (s.types != null) {
406 tp.types.or(s.types);
407 }
408 if (s.properties != null) {
409 tp.properties.or(s.properties);
410 }
411 }
412 }
413 }
414 }
415 this.directBlockStats.clear();
416 this.directBlockPartitions.clear();
417 if (record == null) {
418 return;
419 }
420 this.directBlockSubject = record.subject;
421 }
422
423 if (record.object != null) {
424 final boolean isLiteral = record.object.isLiteral();
425 final PropertyStats ps = this.propertyList.get(record.property);
426 if (ps.detectedType == null) {
427 ps.detectedType = isLiteral ? OWL.DATATYPEPROPERTY : OWL.OBJECTPROPERTY;
428 } else if (ps.detectedType == OWL.DATATYPEPROPERTY && !isLiteral
429 || ps.detectedType == OWL.OBJECTPROPERTY && isLiteral) {
430 ps.detectedType = RDF.PROPERTY;
431 }
432 }
433
434 handleDirectRecordHelper(record, this.sourceList.get(0));
435 if (record.context >= 0) {
436 final Context ctx = this.contextList.get(record.context);
437 for (final URI source : ctx.sources) {
438 handleDirectRecordHelper(record, this.sourceMap.get(source));
439 }
440 }
441 }
442
443 private void handleDirectRecordHelper(final Record record, final SourceStats ss) {
444
445 final boolean isEntity = record.subject.isURI();
446
447 PartialStats s = this.directBlockStats.get(ss);
448 if (s == null) {
449 s = new PartialStats();
450 this.directBlockStats.put(ss, s);
451 if (isEntity) {
452 ++ss.entities;
453 ++s.entities;
454 }
455 if (ProcessorStats.this.processCooccurrences) {
456 ss.types = ss.types != null ? ss.types : new BitSet();
457 ss.properties = ss.properties != null ? ss.properties : new BitSet();
458 s.types = new BitSet();
459 s.properties = new BitSet();
460 }
461 }
462
463 ++ss.triples;
464 ++s.triples;
465
466 if (record.type >= 0) {
467 final TypeStats ts = this.typeList.get(record.type);
468 s.tss = s.tss != null ? s.tss : new HashSet<TypeStats>();
469 s.tss.add(ts);
470 if (Statements.TBOX_CLASSES.contains(ts.type)) {
471 ++ss.tboxTriples;
472 ++s.tboxTriples;
473 } else {
474 ++ss.aboxTriples;
475 ++s.aboxTriples;
476 ++ss.typeTriples;
477 ++s.typeTriples;
478 }
479 if (ProcessorStats.this.processCooccurrences) {
480 ss.types.set(ts.index);
481 s.types.set(ts.index);
482 }
483 }
484
485 final PropertyStats ps = this.propertyList.get(record.property);
486 s.pss = s.pss != null ? s.pss : new HashSet<PropertyStats>();
487 s.pss.add(ps);
488 PropertyStats.Partition pp = ps.partitions[ss.index];
489 if (pp == null) {
490 pp = new PropertyStats.Partition();
491 ps.partitions[ss.index] = pp;
492 }
493 ++pp.triples;
494 if (this.directBlockPartitions.add(pp)) {
495 ++pp.distinctSubjects;
496 pp.entities += isEntity ? 1 : 0;
497 }
498
499 if (record.type < 0) {
500 if (Statements.TBOX_PROPERTIES.contains(ps.property)) {
501 ++ss.tboxTriples;
502 ++s.tboxTriples;
503 } else {
504 ++ss.aboxTriples;
505 ++s.aboxTriples;
506 if (ps.property.equals(OWL.SAMEAS)) {
507 ++ss.sameAsTriples;
508 ++s.sameAsTriples;
509 }
510 }
511 if (ProcessorStats.this.processCooccurrences) {
512 ss.properties.set(ps.index);
513 s.properties.set(ps.index);
514 }
515 }
516 }
517
518 private void handleInverseRecord(final Record record) {
519 if (!record.object.equals(this.inverseBlockObject)) {
520 ++this.inverseBlockVersion;
521 this.inverseBlockObject = record.object;
522 }
523 final PropertyStats ps = this.propertyList.get(record.property);
524 final PropertyStats.Partition p0 = ps.partitions[0];
525 if (p0.version < this.inverseBlockVersion) {
526 p0.version = this.inverseBlockVersion;
527 ++p0.distinctObjects;
528 }
529 if (record.context >= 0) {
530 final Context ctx = this.contextList.get(record.context);
531 for (final URI source : ctx.sources) {
532 final SourceStats ss = this.sourceMap.get(source);
533 PropertyStats.Partition p = ps.partitions[ss.index];
534 if (p == null) {
535 p = new PropertyStats.Partition();
536 ps.partitions[ss.index] = p;
537 } else if (p.version == this.inverseBlockVersion) {
538 continue;
539 }
540 ++p.distinctObjects;
541 }
542 }
543 }
544
545 private void emitStatistics() throws RDFHandlerException {
546
547 this.handler.handleNamespace(VOID.PREFIX, VOID.NAMESPACE);
548 this.handler.handleNamespace(VOIDX.PREFIX, VOIDX.NAMESPACE);
549
550 final Map<URI, URI> spURIs = new HashMap<URI, URI>();
551 for (final SourceStats s : this.sourceList) {
552 final URI uri = mintURI(s.source != null ? s.source : VOID.DATASET);
553 final String label = Statements.formatValue(uri, Namespaces.DEFAULT) + " ("
554 + s.entities + ", " + s.triples + ")";
555 spURIs.put(s.source, uri);
556 emit(uri, RDF.TYPE, VOID.DATASET);
557 emit(uri, VOIDX.LABEL, label);
558 emit(uri, VOIDX.SOURCE, s.source);
559 emit(uri, VOID.ENTITIES, s.entities);
560 emit(uri, VOID.TRIPLES, s.triples);
561 emit(uri, VOIDX.TBOX_TRIPLES, s.tboxTriples);
562 emit(uri, VOIDX.ABOX_TRIPLES, s.aboxTriples);
563 emit(uri, VOIDX.TYPE_TRIPLES, s.typeTriples);
564 emit(uri, VOIDX.SAME_AS_TRIPLES, s.sameAsTriples);
565 if (s.types != null) {
566 emit(uri, VOID.CLASSES, s.types.cardinality());
567 }
568 if (s.properties != null) {
569 emit(uri, VOID.PROPERTIES, s.properties.cardinality());
570 }
571 }
572
573 for (final TypeStats ts : this.typeList) {
574 final TypeStats.Partition p0 = ts.partitions[0];
575 if (p0.entities < ProcessorStats.this.threshold) {
576 continue;
577 }
578 final String label = Statements.formatValue(ts.type, Namespaces.DEFAULT) + " ("
579 + p0.entities + ")";
580 emit(ts.type, VOIDX.LABEL, label);
581 if (ts.example != null) {
582 emit(ts.type, VOIDX.EXAMPLE, ts.example);
583 }
584 for (int i = 0; i < ts.partitions.length; ++i) {
585 final TypeStats.Partition p = ts.partitions[i];
586 if (p != null && p.entities >= ProcessorStats.this.threshold) {
587 final URI source = this.sourceList.get(i).source;
588 final URI spURI = spURIs.get(source);
589 final URI tpURI = mintURI(source, ts.type);
590 final String tpLabel = Statements.formatValue(tpURI, Namespaces.DEFAULT)
591 + " (" + p.entities + ", C)";
592 emit(ts.type, p == p0 ? VOIDX.GLOBAL_STATS : VOIDX.SOURCE_STATS, tpURI);
593 emit(spURI, VOID.CLASS_PARTITION, tpURI);
594 emit(tpURI, RDF.TYPE, VOID.DATASET);
595 emit(tpURI, VOIDX.LABEL, tpLabel);
596 emit(tpURI, VOIDX.SOURCE, source);
597 emit(tpURI, VOID.CLASS, ts.type);
598 emit(tpURI, VOID.ENTITIES, p.entities);
599 emit(tpURI, VOID.TRIPLES, p.triples);
600 emit(tpURI, VOIDX.TBOX_TRIPLES, p.tboxTriples);
601 emit(tpURI, VOIDX.ABOX_TRIPLES, p.aboxTriples);
602 emit(tpURI, VOIDX.TYPE_TRIPLES, p.typeTriples);
603 emit(tpURI, VOIDX.SAME_AS_TRIPLES, p.sameAsTriples);
604 if (p.types != null) {
605 emit(tpURI, VOID.CLASSES, p.types.cardinality());
606 }
607 if (p.properties != null) {
608 emit(tpURI, VOID.PROPERTIES, p.properties.cardinality());
609 }
610 if (p.entities > 0) {
611 emit(tpURI, VOIDX.AVERAGE_PROPERTIES, (double) p.predicates
612 / p.entities);
613 }
614 }
615 }
616 }
617
618 for (final PropertyStats ps : this.propertyList) {
619 final PropertyStats.Partition p0 = ps.partitions[0];
620 if (p0.triples < ProcessorStats.this.threshold) {
621 continue;
622 }
623 final boolean isTBox = Statements.TBOX_PROPERTIES.contains(ps.property);
624 final boolean isType = ps.property.equals(RDF.TYPE);
625 final boolean isSameAs = ps.property.equals(OWL.SAMEAS);
626 final boolean fun = p0.triples > 0 && p0.triples == p0.distinctSubjects;
627 final boolean invfun = p0.triples > 0 && p0.triples == p0.distinctObjects;
628 final boolean data = OWL.DATATYPEPROPERTY.equals(ps.detectedType);
629 final boolean object = OWL.OBJECTPROPERTY.equals(ps.detectedType);
630 final String label = String.format("%s (%d, %s%s%s)", Statements.formatValue(
631 ps.property, Namespaces.DEFAULT), p0.triples, data ? "D" : object ? "O"
632 : "P", fun ? "F" : "", invfun ? "I" : "");
633 emit(ps.property, VOIDX.LABEL, label);
634 emit(ps.property, VOIDX.TYPE, ps.detectedType);
635 if (fun) {
636 emit(ps.property, VOIDX.TYPE, OWL.FUNCTIONALPROPERTY);
637 }
638 if (invfun) {
639 emit(ps.property, VOIDX.TYPE, OWL.INVERSEFUNCTIONALPROPERTY);
640 }
641 if (ps.example != null) {
642 emit(ps.property, VOIDX.EXAMPLE, ps.example);
643 }
644 for (int i = 0; i < ps.partitions.length; ++i) {
645 final PropertyStats.Partition p = ps.partitions[i];
646 if (p != null && p.triples >= ProcessorStats.this.threshold) {
647 final URI source = this.sourceList.get(i).source;
648 final URI spURI = spURIs.get(source);
649 final URI ppURI = mintURI(source, ps.property);
650 final boolean ppFun = p.triples > 0 && p.triples == p.distinctSubjects;
651 final boolean ppInvfun = p.triples > 0 && p.triples == p.distinctObjects;
652 final String ppLabel = String.format("%s (%d, %s%s%s)", Statements
653 .formatValue(ppURI, Namespaces.DEFAULT), p.triples, data ? "D"
654 : object ? "O" : "P", ppFun ? "F" : "", ppInvfun ? "I" : "");
655 emit(ps.property, p == p0 ? VOIDX.GLOBAL_STATS : VOIDX.SOURCE_STATS, ppURI);
656 emit(spURI, VOID.PROPERTY_PARTITION, ppURI);
657 emit(ppURI, RDF.TYPE, VOID.DATASET);
658 emit(ppURI, VOIDX.LABEL, ppLabel);
659 emit(ppURI, VOIDX.SOURCE, source);
660 emit(ppURI, VOID.PROPERTY, ps.property);
661 emit(ppURI, VOID.CLASSES, 0);
662 emit(ppURI, VOID.PROPERTIES, 1);
663 emit(ppURI, VOID.ENTITIES, p.entities);
664 emit(ppURI, VOID.TRIPLES, p.triples);
665 emit(ppURI, VOIDX.TBOX_TRIPLES, isTBox ? p.triples : 0);
666 emit(ppURI, VOIDX.ABOX_TRIPLES, isTBox ? 0 : p.triples);
667 emit(ppURI, VOIDX.TYPE_TRIPLES, isType ? p.triples : 0);
668 emit(ppURI, VOIDX.SAME_AS_TRIPLES, isSameAs ? p.triples : 0);
669 emit(ppURI, VOID.DISTINCT_SUBJECTS, p.distinctSubjects);
670 emit(ppURI, VOID.DISTINCT_OBJECTS, p.distinctObjects);
671 }
672 }
673 }
674
675 for (final URI term : VOID.TERMS) {
676 emit(term, VOIDX.LABEL, Statements.formatValue(term, Namespaces.DEFAULT));
677 }
678 for (final URI term : VOIDX.TERMS) {
679 emit(term, VOIDX.LABEL, Statements.formatValue(term, Namespaces.DEFAULT));
680 }
681 }
682
683 private void emit(@Nullable final Resource subject, @Nullable final URI predicate,
684 @Nullable final Object object) throws RDFHandlerException {
685
686 Value value = null;
687
688 if (subject != null && predicate != null) {
689 if (object instanceof Value) {
690 value = (Value) object;
691 } else if (object instanceof Integer && ((Integer) object).intValue() != 0) {
692 value = Statements.VALUE_FACTORY.createLiteral((Integer) object);
693 } else if (object instanceof Long && ((Long) object).longValue() != 0L) {
694 value = Statements.VALUE_FACTORY.createLiteral((Long) object);
695 } else if (object instanceof Double && ((Double) object).doubleValue() != 0.0) {
696 value = Statements.VALUE_FACTORY.createLiteral((Double) object);
697 } else if (object instanceof String && !((String) object).isEmpty()) {
698 value = Statements.VALUE_FACTORY.createLiteral((String) object,
699 XMLSchema.STRING);
700 }
701 }
702
703 if (value != null) {
704 this.handler.handleStatement(Statements.VALUE_FACTORY.createStatement(subject,
705 predicate, value));
706 }
707 }
708
709 private URI mintURI(final URI... inputURIs) {
710 final StringBuilder builder = new StringBuilder();
711 if (ProcessorStats.this.outputNamespace != null) {
712 builder.append(ProcessorStats.this.outputNamespace);
713 } else {
714 builder.append("stats:");
715 }
716 boolean started = false;
717 for (final URI uri : inputURIs) {
718 if (uri != null) {
719 if (started) {
720 builder.append("_");
721 }
722 started = true;
723 builder.append(uri.getLocalName());
724 }
725 }
726 final String base = builder.toString();
727 for (int i = 0; i < 1000; ++i) {
728 final String candidate = i == 0 ? base : base + "_" + i;
729 if (this.mintedURIs.add(candidate)) {
730 return Statements.VALUE_FACTORY.createURI(candidate);
731 }
732 }
733 throw new Error();
734 }
735
736 }
737
738 private static final class PartialStats {
739
740 @Nullable
741 Set<TypeStats> tss;
742
743 @Nullable
744 Set<PropertyStats> pss;
745
746 @Nullable
747 BitSet types;
748
749 @Nullable
750 BitSet properties;
751
752 long entities;
753
754 long triples;
755
756 long tboxTriples;
757
758 long aboxTriples;
759
760 long typeTriples;
761
762 long sameAsTriples;
763
764 }
765
766 private static final class SourceStats {
767
768 @Nullable
769 final URI source;
770
771 final int index;
772
773 @Nullable
774 BitSet types;
775
776 @Nullable
777 BitSet properties;
778
779 long entities;
780
781 long triples;
782
783 long tboxTriples;
784
785 long aboxTriples;
786
787 long typeTriples;
788
789 long sameAsTriples;
790
791 SourceStats(final URI source, final int index) {
792 this.source = source;
793 this.index = index;
794 this.types = null;
795 this.properties = null;
796 this.entities = 0;
797 this.triples = 0;
798 this.tboxTriples = 0;
799 this.aboxTriples = 0;
800 this.typeTriples = 0;
801 this.sameAsTriples = 0;
802 }
803
804 }
805
806 private static final class TypeStats {
807
808 @Nullable
809 final URI type;
810
811 final int index;
812
813 @Nullable
814 Sampler sampler;
815
816 @Nullable
817 String example;
818
819 @Nullable
820 Partition[] partitions;
821
822 TypeStats(@Nullable final URI type, final int index) {
823 this.type = type;
824 this.index = index;
825 }
826
827 static final class Partition {
828
829 BitSet types;
830
831 BitSet properties;
832
833 long entities;
834
835 long triples;
836
837 long tboxTriples;
838
839 long aboxTriples;
840
841 long typeTriples;
842
843 long sameAsTriples;
844
845 long predicates;
846
847 }
848
849 static class Sampler {
850
851 private static final int MAX_VALUE_LENGTH = 40;
852
853 private static final int MAX_STATEMENTS = 20;
854
855 private URI id;
856
857 private final List<Value> data;
858
859 Sampler() {
860 this.data = new ArrayList<Value>();
861 }
862
863 synchronized void add(final Statement statement) {
864 if (this.data.size() < MAX_STATEMENTS * 2) {
865 this.id = (URI) statement.getSubject();
866 this.data.add(statement.getPredicate());
867 this.data.add(statement.getObject());
868 }
869 }
870
871 synchronized String build() {
872 final List<String> lines = new ArrayList<String>();
873 for (int i = 0; i < this.data.size(); i += 2) {
874 final String predicate = Statements.formatValue(this.data.get(i),
875 Namespaces.DEFAULT);
876 final String object = Statements.formatValue(
877 Statements.shortenValue(this.data.get(i + 1), MAX_VALUE_LENGTH),
878 Namespaces.DEFAULT);
879 lines.add(predicate + " " + object);
880 }
881 Collections.sort(lines);
882 final StringBuilder builder = new StringBuilder(Statements.formatValue(this.id,
883 Namespaces.DEFAULT));
884 for (int i = 0; i < lines.size(); ++i) {
885 builder.append("\n ").append(lines.get(i));
886 builder.append(i < lines.size() - 1 ? ';' : '.');
887 }
888 return builder.toString();
889 }
890
891 }
892
893 }
894
895 private static final class PropertyStats {
896
897 @Nullable
898 final URI property;
899
900 final int index;
901
902 @Nullable
903 Sampler sampler;
904
905 @Nullable
906 String example;
907
908 @Nullable
909 URI detectedType;
910
911 @Nullable
912 Partition[] partitions;
913
914 PropertyStats(final URI property, final int index) {
915 this.property = property;
916 this.index = index;
917 this.detectedType = null;
918 }
919
920 static final class Partition {
921
922 long entities;
923
924 long triples;
925
926 long distinctSubjects;
927
928 long distinctObjects;
929
930 long version;
931
932 }
933
934 static final class Sampler {
935
936 private static final int MAX_VALUE_LENGTH = 40;
937
938 private static final int MAX_STATEMENTS = 3;
939
940 private final Statement[] statements;
941
942 private boolean haveBNode;
943
944 private boolean haveLiteral;
945
946 private boolean haveURI;
947
948 private int size;
949
950 Sampler() {
951 this.statements = new Statement[MAX_STATEMENTS];
952 this.haveBNode = false;
953 this.haveLiteral = false;
954 this.haveURI = false;
955 this.size = 0;
956 }
957
958 synchronized void add(final Statement statement) {
959
960 final Resource s = statement.getSubject();
961 final Value o = statement.getObject();
962 final boolean isURI = o instanceof URI;
963 final boolean isBNode = o instanceof BNode;
964 final boolean isLiteral = o instanceof Literal;
965
966 if (!(s instanceof URI)
967 || this.size == this.statements.length
968 && (isURI && this.haveURI || isBNode && this.haveBNode || isLiteral
969 && this.haveLiteral)) {
970 return;
971 }
972
973 int index = -1;
974 for (int i = 0; i < this.statements.length; ++i) {
975 final Statement stmt = this.statements[i];
976 if (stmt == null) {
977 index = i;
978 ++this.size;
979 break;
980 } else if (stmt.equals(statement)) {
981 return;
982 } else if (!this.haveURI && isURI
983 || !this.haveBNode && isBNode
984 || !this.haveLiteral && isLiteral) {
985 index = i;
986 break;
987 }
988 }
989 if (index >= 0) {
990 this.statements[index] = statement;
991 this.haveURI |= isURI;
992 this.haveBNode |= isBNode;
993 this.haveLiteral |= isLiteral;
994 }
995 }
996
997 synchronized String build() {
998 final StringBuilder builder = new StringBuilder();
999 for (final Statement statement : this.statements) {
1000 if (statement != null) {
1001 builder.append("\n ")
1002 .append(Statements.formatValue(statement.getSubject(),
1003 Namespaces.DEFAULT))
1004 .append(" ")
1005 .append(Statements.formatValue(statement.getPredicate(),
1006 Namespaces.DEFAULT))
1007 .append(" ")
1008 .append(Statements.formatValue(Statements.shortenValue(
1009 statement.getObject(), MAX_VALUE_LENGTH),
1010 Namespaces.DEFAULT)).append(" .");
1011 }
1012 }
1013 return builder.toString();
1014 }
1015
1016 }
1017
1018 }
1019
1020 private static final class Context {
1021
1022 private static final URI[] EMPTY = new URI[0];
1023
1024 final int index;
1025
1026 URI[] sources;
1027
1028 boolean used;
1029
1030 Context(final int index) {
1031 this.index = index;
1032 this.sources = EMPTY;
1033 this.used = false;
1034 }
1035
1036 }
1037
1038 private static final class Record {
1039
1040 final boolean inverse;
1041
1042 @Nullable
1043 final Hash subject;
1044
1045 final int property;
1046
1047 final int type;
1048
1049 @Nullable
1050 final Hash object;
1051
1052 final int context;
1053
1054 private Record(final boolean inverse, final Hash subject, final int predicate,
1055 final int type, final Hash object, final int context) {
1056 assert !inverse || object != null;
1057 assert inverse || subject != null;
1058 this.inverse = inverse;
1059 this.subject = subject;
1060 this.property = predicate;
1061 this.type = type;
1062 this.object = object;
1063 this.context = context;
1064
1065 }
1066
1067 public static Record create(final boolean inverse, @Nullable final Hash subject,
1068 final int predicate, final int type, @Nullable final Hash object,
1069 @Nullable final int context) {
1070 return new Record(inverse, subject, predicate, type, object, context);
1071 }
1072
1073 public static Record read(final Input reader) throws IOException {
1074
1075 final Hash hash = Hash.read(reader);
1076
1077 boolean inverse = false;
1078 Hash subject = null;
1079 Hash object = null;
1080 int predicate = -1;
1081 int type = -1;
1082 int context = -1;
1083
1084 final int c = (int) reader.readNumber();
1085 final boolean hasContext = (c & 0x01) != 0;
1086 final int format = c & 0xE;
1087
1088 if (format == 8) {
1089
1090 inverse = true;
1091 object = hash;
1092 predicate = (int) reader.readNumber();
1093 } else if (format == 4) {
1094
1095 subject = hash;
1096 type = (int) reader.readNumber();
1097 predicate = 0;
1098 } else if (format == 2) {
1099
1100 subject = hash;
1101 predicate = (int) reader.readNumber();
1102 object = Hash.read(reader);
1103 } else {
1104 throw new Error("format is " + format);
1105 }
1106
1107 if (hasContext) {
1108 context = (int) reader.readNumber();
1109 }
1110
1111 return create(inverse, subject, predicate, type, object, context);
1112 }
1113
1114 public void write(final Output writer) throws IOException {
1115
1116 final int flag = this.context >= 0 ? 1 : 0;
1117
1118 if (this.inverse) {
1119
1120 this.object.write(writer);
1121 writer.writeNumber(flag + 8);
1122 writer.writeNumber(this.property);
1123 } else if (this.object == null) {
1124
1125 this.subject.write(writer);
1126 writer.writeNumber(flag + 4);
1127 writer.writeNumber(this.type);
1128 } else {
1129
1130 this.subject.write(writer);
1131 writer.writeNumber(flag + 2);
1132 writer.writeNumber(this.property);
1133 this.object.write(writer);
1134 }
1135
1136 if (this.context >= 0) {
1137 writer.writeNumber(this.context);
1138 }
1139 }
1140
1141 }
1142
1143
1144
1145 private static final class Hash {
1146
1147 private static final int MAX_LENGTH = 4 * 1024;
1148
1149 private static final int TABLE_SIZE = 4 * 1024 - 1;
1150
1151 private static final Hash[] TABLE_HASHES = new Hash[TABLE_SIZE];
1152
1153 private static final Value[] TABLE_VALUES = new Value[TABLE_SIZE];
1154
1155 private static final Index<URI> DATATYPE_INDEX = new Index<URI>(1024);
1156
1157 private static final Index<String> LANGUAGE_INDEX = new Index<String>(1024);
1158
1159 private final long lo;
1160
1161 private final long hi;
1162
1163 public static Hash read(final Input reader) throws IOException {
1164 final long lo = reader.readNumber();
1165 final long hi = reader.readNumber();
1166 return new Hash(lo, hi);
1167 }
1168
1169 public static Hash create(final Value value) {
1170 if (value.stringValue().length() > MAX_LENGTH) {
1171 return compute(value);
1172 }
1173 final int index = (value.hashCode() & 0x7FFFFFFF) % TABLE_SIZE;
1174 synchronized (TABLE_VALUES) {
1175 if (value.equals(TABLE_VALUES[index])) {
1176 return TABLE_HASHES[index];
1177 }
1178 }
1179 final Hash hash = compute(value);
1180 synchronized (TABLE_VALUES) {
1181 TABLE_VALUES[index] = value;
1182 TABLE_HASHES[index] = hash;
1183 }
1184 return hash;
1185 }
1186
1187 private static Hash compute(final Value value) {
1188
1189 final String string = value.stringValue();
1190
1191 boolean doHash = true;
1192 long lo = 0;
1193 long hi = 0;
1194
1195 final int length = string.length();
1196 if (length <= 15) {
1197 doHash = false;
1198 long cur = 0;
1199 for (int i = 0; i < 16; ++i) {
1200 int c = 1;
1201 if (i < length) {
1202 c = string.charAt(i);
1203 if (c <= 0 || c >= 128) {
1204 doHash = true;
1205 break;
1206 }
1207 }
1208 cur = cur << 8 | c;
1209 if (i == 7) {
1210 lo = cur;
1211 cur = 0;
1212 }
1213 }
1214 hi = cur;
1215 }
1216
1217 if (doHash) {
1218 final eu.fbk.rdfpro.util.Hash hash = eu.fbk.rdfpro.util.Hash.murmur3(string);
1219 lo = hash.getLow();
1220 hi = hash.getHigh();
1221 }
1222
1223 lo = (lo & 0x7F7F7F7F7F7F7F7FL) + 0x0101010101010101L;
1224 lo = lo & 0x7F7F7F7F7F7F7F7FL | (lo & 0x8080808080808080L) >> 1;
1225 hi = (hi & 0x7F7F7F7F7F7F7F7FL) + 0x0101010101010101L;
1226 hi = hi & 0x7F7F7F7F7F7F7F7FL | (hi & 0x8080808080808080L) >> 1;
1227 hi = hi & 0x0FFFFFFFFFFFFFFFL | 0x4000000000000000L;
1228
1229 if (value instanceof URI) {
1230 hi = hi | 0x3000000000000000L;
1231 } else if (value instanceof BNode) {
1232 hi = hi | 0x2000000000000000L;
1233 } else if (value instanceof Literal) {
1234 hi = hi | 0x1000000000000000L;
1235 final Literal literal = (Literal) value;
1236 int index = 0;
1237 if (literal.getLanguage() != null) {
1238 index = LANGUAGE_INDEX.put(literal.getLanguage()) | 0x40000000;
1239 } else if (literal.getDatatype() != null) {
1240 index = DATATYPE_INDEX.put(literal.getDatatype());
1241 }
1242 index = index & 0x7FFFFFFF;
1243 lo = (lo ^ index) & 0xFFFFFFFF7F7F7F7FL;
1244 if ((lo & 0xFFL) == 0L) {
1245 lo = lo | 0x01L;
1246 }
1247 if ((lo & 0xFF00L) == 0L) {
1248 lo = lo | 0x0100L;
1249 }
1250 if ((lo & 0xFF0000L) == 0L) {
1251 lo = lo | 0x010000L;
1252 }
1253 if ((lo & 0xFF000000L) == 0L) {
1254 lo = lo | 0x01000000L;
1255 }
1256 }
1257
1258 return new Hash(lo, hi);
1259 }
1260
1261 private Hash(final long lo, final long hi) {
1262 this.lo = lo;
1263 this.hi = hi;
1264 }
1265
1266 public boolean isURI() {
1267 return (this.hi & 0x3000000000000000L) == 0x3000000000000000L;
1268 }
1269
1270 public boolean isLiteral() {
1271 return (this.hi & 0x3000000000000000L) == 0x1000000000000000L;
1272 }
1273
1274 @Override
1275 public boolean equals(final Object object) {
1276 if (object == this) {
1277 return true;
1278 }
1279 if (!(object instanceof Hash)) {
1280 return false;
1281 }
1282 final Hash other = (Hash) object;
1283 return this.lo == other.lo && this.hi == other.hi;
1284 }
1285
1286 @Override
1287 public int hashCode() {
1288 final int hh = (int) (this.hi >> 32);
1289 final int hl = (int) this.hi;
1290 final int lh = (int) (this.lo >> 32);
1291 final int ll = (int) this.lo;
1292 return ((hh * 37 + hl) * 37 + lh) * 37 + ll;
1293 }
1294
1295 public void write(final Output writer) throws IOException {
1296 writer.writeNumber(this.lo);
1297 writer.writeNumber(this.hi);
1298 }
1299
1300 }
1301
1302 private static final class Index<T> {
1303
1304 private final Map<T, Integer> map;
1305
1306 private final List<T> list;
1307
1308 private final int size;
1309
1310 Index(final int size) {
1311 final int capacity = Math.min(size, 1024);
1312 this.map = new HashMap<T, Integer>(capacity);
1313 this.list = new ArrayList<T>(capacity);
1314 this.size = size;
1315 }
1316
1317 @Nullable
1318 synchronized Integer put(final T element) {
1319 Integer index = this.map.get(element);
1320 if (index == null && this.list.size() < this.size) {
1321 index = this.list.size() + 1;
1322 this.list.add(element);
1323 this.map.put(element, index);
1324 }
1325 return index;
1326 }
1327
1328 @Nullable
1329 synchronized T get(final int index) {
1330 return this.list.get(index - 1);
1331 }
1332
1333 }
1334
1335 }