1   /*
2    * RDFpro - An extensible tool for building stream-oriented RDF processing libraries.
3    * 
4    * Written in 2014 by Francesco Corcoglioniti with support by Marco Amadori, Michele Mostarda,
5    * Alessio Palmero Aprosio and Marco Rospocher. Contact info on http://rdfpro.fbk.eu/
6    * 
7    * To the extent possible under law, the authors have dedicated all copyright and related and
8    * neighboring rights to this software to the public domain worldwide. This software is
9    * distributed without any warranty.
10   * 
11   * You should have received a copy of the CC0 Public Domain Dedication along with this software.
12   * If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
13   */
14  package eu.fbk.rdfpro;
15  
16  import java.io.IOException;
17  import java.util.ArrayList;
18  import java.util.Arrays;
19  import java.util.BitSet;
20  import java.util.Collections;
21  import java.util.HashMap;
22  import java.util.HashSet;
23  import java.util.List;
24  import java.util.Map;
25  import java.util.Objects;
26  import java.util.Set;
27  import java.util.concurrent.ConcurrentHashMap;
28  import java.util.function.Consumer;
29  
30  import javax.annotation.Nullable;
31  
32  import org.openrdf.model.BNode;
33  import org.openrdf.model.Literal;
34  import org.openrdf.model.Resource;
35  import org.openrdf.model.Statement;
36  import org.openrdf.model.URI;
37  import org.openrdf.model.Value;
38  import org.openrdf.model.vocabulary.OWL;
39  import org.openrdf.model.vocabulary.RDF;
40  import org.openrdf.model.vocabulary.XMLSchema;
41  import org.openrdf.rio.RDFHandler;
42  import org.openrdf.rio.RDFHandlerException;
43  import org.slf4j.Logger;
44  import org.slf4j.LoggerFactory;
45  
46  import eu.fbk.rdfpro.util.IO;
47  import eu.fbk.rdfpro.util.Namespaces;
48  import eu.fbk.rdfpro.util.Sorter;
49  import eu.fbk.rdfpro.util.Sorter.Input;
50  import eu.fbk.rdfpro.util.Sorter.Output;
51  import eu.fbk.rdfpro.util.Statements;
52  import eu.fbk.rdfpro.vocab.VOID;
53  import eu.fbk.rdfpro.vocab.VOIDX;
54  
55  final class ProcessorStats implements RDFProcessor {
56  
    /** Logger used for debug messages of this processor. */
    private static final Logger LOGGER = LoggerFactory.getLogger(ProcessorStats.class);

    /** Prefix of minted statistics URIs; when null, the prefix "stats:" is used instead. */
    @Nullable
    private final String outputNamespace;

    /**
     * Predicate of the statements that associate a subject to a source URI; when null, no
     * statement is treated as a source statement.
     */
    @Nullable
    private final URI sourceProperty;

    /** When non-null, only statements in this context are considered as source statements. */
    @Nullable
    private final URI sourceContext;

    /** Whether bit sets of co-occurring types and properties are maintained and emitted. */
    private final boolean processCooccurrences;

    /** Minimum number of entities (types) or triples (properties) for statistics to be emitted. */
    private final long threshold;
71  
72      ProcessorStats(@Nullable final String outputNamespace, @Nullable final URI sourceProperty,
73              @Nullable final URI sourceContext, @Nullable final Long threshold,
74              final boolean processCooccurrences) {
75          this.outputNamespace = outputNamespace;
76          this.sourceProperty = sourceProperty;
77          this.sourceContext = sourceContext;
78          this.processCooccurrences = processCooccurrences;
79          this.threshold = threshold != null ? threshold : 0;
80      }
81  
82      @Override
83      public RDFHandler wrap(final RDFHandler handler) {
84          return new Handler(Objects.requireNonNull(handler));
85      }
86  
87      private final class Handler extends AbstractRDFHandler {
88  
        /** Wrapped handler that receives namespaces and the emitted statistics statements. */
        private final RDFHandler handler;

        /** Source statistics by index; index 0 holds the entry with a null source. */
        private final List<SourceStats> sourceList;

        /** Source statistics by source URI; the null key maps to the entry at index 0. */
        private final Map<URI, SourceStats> sourceMap;

        /** Maps each source URI seen to the single instance reused for that source. */
        private final ConcurrentHashMap<URI, URI> sourceInterner;

        /** Type statistics by index (indexes are assigned in order of first occurrence). */
        private final List<TypeStats> typeList;

        /** Type statistics by type URI; cleared at the end of the first pass. */
        private final Map<URI, TypeStats> typeMap;

        /** Property statistics by index; rdf:type is registered at index 0. */
        private final List<PropertyStats> propertyList;

        /** Property statistics by property URI; cleared at the end of the first pass. */
        private final Map<URI, PropertyStats> propertyMap;

        /** Contexts by index; entries never used as a statement context are nulled in endRDF. */
        private final List<Context> contextList;

        /**
         * Contexts by hash of either a statement context or the subject of a source statement;
         * cleared at the end of the first pass.
         */
        private final Map<Hash, Context> contextMap;

        /** Samplers of type examples, keyed by the URI subject they collect statements of. */
        private final Map<URI, TypeStats.Sampler> samplerMap;

        /** Cleared at each startRDF; presumably tracks URIs produced by mintURI (confirm there). */
        private final Set<String> mintedURIs;

        /** Subject of the block of direct records currently being aggregated. */
        private Hash directBlockSubject;

        /** Partial statistics of the current subject block, one entry per source involved. */
        private final Map<SourceStats, PartialStats> directBlockStats;

        /** Property partitions already counted for the subject of the current block. */
        private final Set<PropertyStats.Partition> directBlockPartitions;

        /** Object of the block of inverse records currently being processed. */
        private Hash inverseBlockObject;

        /** Counter incremented each time the object of the inverse records changes. */
        private long inverseBlockVersion;

        /** Sorter of records, created in startRDF on the first pass and released in endRDF. */
        private Sorter<Record> sorter;

        /** True until the first endRDF has completed. */
        private boolean firstPass;
126 
127         Handler(final RDFHandler handler) {
128             this.handler = handler;
129             this.sourceList = new ArrayList<SourceStats>();
130             this.sourceMap = new HashMap<URI, SourceStats>();
131             this.sourceInterner = new ConcurrentHashMap<URI, URI>();
132             this.typeList = new ArrayList<TypeStats>();
133             this.typeMap = new HashMap<URI, TypeStats>();
134             this.propertyList = new ArrayList<PropertyStats>();
135             this.propertyMap = new HashMap<URI, PropertyStats>();
136             this.contextList = new ArrayList<Context>();
137             this.contextMap = new HashMap<Hash, Context>();
138             this.samplerMap = new HashMap<URI, TypeStats.Sampler>();
139             this.directBlockSubject = null;
140             this.directBlockStats = new HashMap<SourceStats, PartialStats>();
141             this.directBlockPartitions = new HashSet<PropertyStats.Partition>();
142             this.inverseBlockObject = null;
143             this.inverseBlockVersion = 0L;
144             this.mintedURIs = new HashSet<String>();
145             this.sorter = null;
146             this.firstPass = true;
147 
148             PropertyStats ps = new PropertyStats(RDF.TYPE, 0);
149             this.propertyMap.put(RDF.TYPE, ps);
150             this.propertyList.add(ps);
151         }
152 
153         @Override
154         public void startRDF() throws RDFHandlerException {
155             this.handler.startRDF();
156             this.mintedURIs.clear();
157             if (this.firstPass) {
158                 this.sorter = new Sorter<Record>() {
159 
160                     @Override
161                     protected void encode(final Output writer, final Record record)
162                             throws IOException {
163                         record.write(writer);
164                     }
165 
166                     @Override
167                     protected Record decode(final Input reader) throws IOException {
168                         return Record.read(reader);
169                     }
170 
171                 };
172                 try {
173                     this.sorter.start(true);
174                 } catch (final IOException ex) {
175                     throw new RDFHandlerException(ex);
176                 }
177             }
178         }
179 
        /**
         * Processes a statement during the first pass: registers its property, type and
         * context, records source associations, emits the corresponding records to the sorter
         * and feeds the samplers used to build examples. Statements received after the first
         * pass are ignored.
         *
         * @param statement
         *            the statement to process
         * @throws RDFHandlerException
         *             if emitting a record to the sorter fails
         */
        @Override
        public void handleStatement(final Statement statement) throws RDFHandlerException {

            // Statements are analyzed only during the first pass
            if (!this.firstPass) {
                return;
            }

            final Resource s = statement.getSubject();
            final URI p = statement.getPredicate();
            final Value o = statement.getObject();
            final Resource c = statement.getContext();

            // For rdf:type statements with a URI object the object is represented by the type
            // index, so no object hash is computed (and no inverse record is emitted below)
            final boolean isURIType = o instanceof URI && p.equals(RDF.TYPE);
            final Hash sh = Hash.create(s);
            final Hash oh = isURIType ? null : Hash.create(o);

            // Look up or register the property; its index is its position in propertyList
            PropertyStats ps;
            synchronized (this.propertyList) {
                ps = this.propertyMap.get(p);
                if (ps == null) {
                    ps = new PropertyStats(p, this.propertyList.size());
                    this.propertyMap.put(p, ps);
                    this.propertyList.add(ps);
                }
            }

            // Look up or register the type (only for rdf:type statements with a URI object)
            TypeStats ts = null;
            if (isURIType) {
                synchronized (this.typeList) {
                    ts = this.typeMap.get(o);
                    if (ts == null) {
                        ts = new TypeStats((URI) o, this.typeList.size());
                        this.typeMap.put((URI) o, ts);
                        this.typeList.add(ts);
                    }
                }
            }

            // Look up or register the statement context, marking it as actually used
            Context ctx = null;
            if (c != null) {
                final Hash ch = Hash.create(c);
                synchronized (this.contextList) {
                    ctx = this.contextMap.get(ch);
                    if (ctx == null) {
                        ctx = new Context(this.contextList.size());
                        this.contextMap.put(ch, ctx);
                        this.contextList.add(ctx);
                    }
                    ctx.used = true;
                }
            }

            // Source statement: its subject identifies a context (looked up by the subject
            // hash) and its URI object is a source to associate to that context. The context
            // is not marked as used here, as that happens only when it occurs in a statement
            if (o instanceof URI
                    && p.equals(ProcessorStats.this.sourceProperty)
                    && (ProcessorStats.this.sourceContext == null || Objects.equals(c,
                            ProcessorStats.this.sourceContext))) {
                // Reuse a single URI instance for each distinct source
                URI source = this.sourceInterner.putIfAbsent((URI) o, (URI) o);
                source = source != null ? source : (URI) o;
                Context sctx;
                synchronized (this.contextList) {
                    sctx = this.contextMap.get(sh);
                    if (sctx == null) {
                        sctx = new Context(this.contextList.size());
                        this.contextMap.put(sh, sctx);
                        this.contextList.add(sctx);
                    }
                }
                // Append the source to the array of the context, unless already there
                synchronized (sctx) {
                    if (!Arrays.asList(sctx.sources).contains(source)) {
                        final URI[] array = new URI[sctx.sources.length + 1];
                        System.arraycopy(sctx.sources, 0, array, 0, sctx.sources.length);
                        array[array.length - 1] = source;
                        sctx.sources = array;
                    }
                }
            }

            // Indexes stored in the records; -1 denotes a missing type or context
            final int pi = ps.index;
            final int ti = ts == null ? -1 : ts.index;
            final int ci = ctx == null ? -1 : ctx.index;

            // The direct record carries the subject; the inverse record carries only the object
            final Record direct = Record.create(false, sh, pi, ti, oh, ci);
            final Record inverse = isURIType ? null : Record.create(true, null, pi, ti, oh, ci);

            try {
                this.sorter.emit(direct);
                if (inverse != null) {
                    this.sorter.emit(inverse);
                }
            } catch (final Throwable ex) {
                throw new RDFHandlerException(ex);
            }

            // Feed the sampler of the property, creating it on first use
            synchronized (ps) {
                if (ps.sampler == null) {
                    ps.sampler = new PropertyStats.Sampler();
                }
                ps.sampler.add(statement);
            }

            // Feed the type samplers, which are keyed by URI subject
            if (s instanceof URI) {
                synchronized (this.samplerMap) {
                    TypeStats.Sampler sampler = this.samplerMap.get(s);
                    if (sampler != null) {
                        // Subject already sampled: reuse its sampler for a type lacking one
                        sampler.add(statement);
                        if (ts != null && ts.sampler == null) {
                            ts.sampler = sampler;
                        }
                    } else if (ts != null && ts.sampler == null) {
                        // First statement typing a subject with a type that has no sampler yet
                        sampler = new TypeStats.Sampler();
                        sampler.add(statement);
                        ts.sampler = sampler;
                        this.samplerMap.put((URI) s, sampler);
                    }
                }
            }
        }
297 
298         @Override
299         public void endRDF() throws RDFHandlerException {
300             if (this.firstPass) {
301                 try {
302                     this.typeMap.clear(); // no more used
303                     this.propertyMap.clear(); // no more used
304                     this.contextMap.clear(); // no more used
305                     this.samplerMap.clear(); // no more used
306                     this.sourceInterner.clear(); // no more used
307 
308                     final SourceStats s0 = new SourceStats(null, 0);
309                     this.sourceMap.put(null, s0);
310                     this.sourceList.add(s0);
311 
312                     for (int i = 0; i < this.contextList.size(); ++i) {
313                         final Context ctx = this.contextList.get(i);
314                         if (!ctx.used) {
315                             this.contextList.set(i, null);
316                         } else {
317                             for (final URI source : ctx.sources) {
318                                 SourceStats ss = this.sourceMap.get(source);
319                                 if (ss == null) {
320                                     ss = new SourceStats(source, this.sourceList.size());
321                                     this.sourceMap.put(source, ss);
322                                     this.sourceList.add(ss);
323                                 }
324                             }
325                         }
326                     }
327 
328                     for (final TypeStats ts : this.typeList) {
329                         ts.partitions = new TypeStats.Partition[this.sourceList.size()];
330                         ts.partitions[0] = new TypeStats.Partition();
331                         if (ts.sampler != null) {
332                             ts.example = ts.sampler.build();
333                             ts.sampler = null; // release memory
334                         }
335                     }
336 
337                     for (final PropertyStats ps : this.propertyList) {
338                         ps.partitions = new PropertyStats.Partition[this.sourceList.size()];
339                         ps.partitions[0] = new PropertyStats.Partition();
340                         if (ps.sampler != null) {
341                             ps.example = ps.sampler.build();
342                             ps.sampler = null; // release memory
343                         }
344                     }
345 
346                     LOGGER.debug("Status: {} properties, {} types, {} contexts, " + "{} sources",
347                             this.propertyList.size(), this.typeList.size(),
348                             this.contextMap.size(), this.sourceList.size());
349 
350                     this.sorter.end(false, new Consumer<Record>() {
351 
352                         @Override
353                         public void accept(final Record record) {
354                             if (record.inverse) {
355                                 handleInverseRecord(record);
356                             } else {
357                                 handleDirectRecord(record);
358                             }
359                         }
360 
361                     });
362                     this.sorter = null;
363                     handleDirectRecord(null); // flush last direct block
364 
365                 } catch (final IOException ex) {
366                     throw new RDFHandlerException(ex);
367                 }
368             }
369             emitStatistics();
370             this.handler.endRDF();
371             this.firstPass = false;
372         }
373 
374         @Override
375         public void close() {
376             IO.closeQuietly(this.sorter);
377             IO.closeQuietly(this.handler);
378         }
379 
380         private void handleDirectRecord(@Nullable final Record record) {
381 
382             if (record == null || !record.subject.equals(this.directBlockSubject)) {
383                 for (final Map.Entry<SourceStats, PartialStats> e : this.directBlockStats
384                         .entrySet()) {
385                     final int index = e.getKey().index;
386                     final PartialStats s = e.getValue();
387                     if (s.tss != null) {
388                         for (final TypeStats ts : s.tss) {
389                             TypeStats.Partition tp = ts.partitions[index];
390                             if (tp == null) {
391                                 tp = new TypeStats.Partition();
392                                 ts.partitions[index] = tp;
393                             }
394                             tp.triples += s.triples;
395                             tp.tboxTriples += s.tboxTriples;
396                             tp.aboxTriples += s.aboxTriples;
397                             tp.typeTriples += s.typeTriples;
398                             tp.sameAsTriples += s.sameAsTriples;
399                             tp.predicates += s.pss == null ? 0 : s.pss.size();
400                             tp.entities += s.entities;
401                             if (ProcessorStats.this.processCooccurrences) {
402                                 tp.types = tp.types != null ? tp.types : new BitSet();
403                                 tp.properties = tp.properties != null ? tp.properties
404                                         : new BitSet();
405                                 if (s.types != null) {
406                                     tp.types.or(s.types);
407                                 }
408                                 if (s.properties != null) {
409                                     tp.properties.or(s.properties);
410                                 }
411                             }
412                         }
413                     }
414                 }
415                 this.directBlockStats.clear();
416                 this.directBlockPartitions.clear();
417                 if (record == null) {
418                     return;
419                 }
420                 this.directBlockSubject = record.subject;
421             }
422 
423             if (record.object != null) {
424                 final boolean isLiteral = record.object.isLiteral();
425                 final PropertyStats ps = this.propertyList.get(record.property);
426                 if (ps.detectedType == null) {
427                     ps.detectedType = isLiteral ? OWL.DATATYPEPROPERTY : OWL.OBJECTPROPERTY;
428                 } else if (ps.detectedType == OWL.DATATYPEPROPERTY && !isLiteral
429                         || ps.detectedType == OWL.OBJECTPROPERTY && isLiteral) {
430                     ps.detectedType = RDF.PROPERTY;
431                 }
432             }
433 
434             handleDirectRecordHelper(record, this.sourceList.get(0));
435             if (record.context >= 0) {
436                 final Context ctx = this.contextList.get(record.context);
437                 for (final URI source : ctx.sources) {
438                     handleDirectRecordHelper(record, this.sourceMap.get(source));
439                 }
440             }
441         }
442 
443         private void handleDirectRecordHelper(final Record record, final SourceStats ss) {
444 
445             final boolean isEntity = record.subject.isURI();
446 
447             PartialStats s = this.directBlockStats.get(ss);
448             if (s == null) {
449                 s = new PartialStats();
450                 this.directBlockStats.put(ss, s);
451                 if (isEntity) {
452                     ++ss.entities;
453                     ++s.entities;
454                 }
455                 if (ProcessorStats.this.processCooccurrences) {
456                     ss.types = ss.types != null ? ss.types : new BitSet();
457                     ss.properties = ss.properties != null ? ss.properties : new BitSet();
458                     s.types = new BitSet();
459                     s.properties = new BitSet();
460                 }
461             }
462 
463             ++ss.triples;
464             ++s.triples;
465 
466             if (record.type >= 0) {
467                 final TypeStats ts = this.typeList.get(record.type);
468                 s.tss = s.tss != null ? s.tss : new HashSet<TypeStats>();
469                 s.tss.add(ts);
470                 if (Statements.TBOX_CLASSES.contains(ts.type)) {
471                     ++ss.tboxTriples;
472                     ++s.tboxTriples;
473                 } else {
474                     ++ss.aboxTriples;
475                     ++s.aboxTriples;
476                     ++ss.typeTriples;
477                     ++s.typeTriples;
478                 }
479                 if (ProcessorStats.this.processCooccurrences) {
480                     ss.types.set(ts.index);
481                     s.types.set(ts.index);
482                 }
483             }
484 
485             final PropertyStats ps = this.propertyList.get(record.property);
486             s.pss = s.pss != null ? s.pss : new HashSet<PropertyStats>();
487             s.pss.add(ps);
488             PropertyStats.Partition pp = ps.partitions[ss.index];
489             if (pp == null) {
490                 pp = new PropertyStats.Partition();
491                 ps.partitions[ss.index] = pp;
492             }
493             ++pp.triples;
494             if (this.directBlockPartitions.add(pp)) {
495                 ++pp.distinctSubjects;
496                 pp.entities += isEntity ? 1 : 0;
497             }
498 
499             if (record.type < 0) {
500                 if (Statements.TBOX_PROPERTIES.contains(ps.property)) {
501                     ++ss.tboxTriples;
502                     ++s.tboxTriples;
503                 } else {
504                     ++ss.aboxTriples;
505                     ++s.aboxTriples;
506                     if (ps.property.equals(OWL.SAMEAS)) {
507                         ++ss.sameAsTriples;
508                         ++s.sameAsTriples;
509                     }
510                 }
511                 if (ProcessorStats.this.processCooccurrences) {
512                     ss.properties.set(ps.index);
513                     s.properties.set(ps.index);
514                 }
515             }
516         }
517 
518         private void handleInverseRecord(final Record record) {
519             if (!record.object.equals(this.inverseBlockObject)) {
520                 ++this.inverseBlockVersion;
521                 this.inverseBlockObject = record.object;
522             }
523             final PropertyStats ps = this.propertyList.get(record.property);
524             final PropertyStats.Partition p0 = ps.partitions[0];
525             if (p0.version < this.inverseBlockVersion) {
526                 p0.version = this.inverseBlockVersion;
527                 ++p0.distinctObjects;
528             }
529             if (record.context >= 0) {
530                 final Context ctx = this.contextList.get(record.context);
531                 for (final URI source : ctx.sources) {
532                     final SourceStats ss = this.sourceMap.get(source); // TODO avoid sourceMap
533                     PropertyStats.Partition p = ps.partitions[ss.index];
534                     if (p == null) {
535                         p = new PropertyStats.Partition();
536                         ps.partitions[ss.index] = p;
537                     } else if (p.version == this.inverseBlockVersion) {
538                         continue;
539                     }
540                     ++p.distinctObjects;
541                 }
542             }
543         }
544 
545         private void emitStatistics() throws RDFHandlerException {
546 
547             this.handler.handleNamespace(VOID.PREFIX, VOID.NAMESPACE);
548             this.handler.handleNamespace(VOIDX.PREFIX, VOIDX.NAMESPACE);
549 
550             final Map<URI, URI> spURIs = new HashMap<URI, URI>();
551             for (final SourceStats s : this.sourceList) {
552                 final URI uri = mintURI(s.source != null ? s.source : VOID.DATASET);
553                 final String label = Statements.formatValue(uri, Namespaces.DEFAULT) + " ("
554                         + s.entities + ", " + s.triples + ")";
555                 spURIs.put(s.source, uri);
556                 emit(uri, RDF.TYPE, VOID.DATASET);
557                 emit(uri, VOIDX.LABEL, label);
558                 emit(uri, VOIDX.SOURCE, s.source);
559                 emit(uri, VOID.ENTITIES, s.entities);
560                 emit(uri, VOID.TRIPLES, s.triples);
561                 emit(uri, VOIDX.TBOX_TRIPLES, s.tboxTriples);
562                 emit(uri, VOIDX.ABOX_TRIPLES, s.aboxTriples);
563                 emit(uri, VOIDX.TYPE_TRIPLES, s.typeTriples);
564                 emit(uri, VOIDX.SAME_AS_TRIPLES, s.sameAsTriples);
565                 if (s.types != null) {
566                     emit(uri, VOID.CLASSES, s.types.cardinality());
567                 }
568                 if (s.properties != null) {
569                     emit(uri, VOID.PROPERTIES, s.properties.cardinality());
570                 }
571             }
572 
573             for (final TypeStats ts : this.typeList) {
574                 final TypeStats.Partition p0 = ts.partitions[0];
575                 if (p0.entities < ProcessorStats.this.threshold) {
576                     continue;
577                 }
578                 final String label = Statements.formatValue(ts.type, Namespaces.DEFAULT) + " ("
579                         + p0.entities + ")";
580                 emit(ts.type, VOIDX.LABEL, label);
581                 if (ts.example != null) {
582                     emit(ts.type, VOIDX.EXAMPLE, ts.example);
583                 }
584                 for (int i = 0; i < ts.partitions.length; ++i) {
585                     final TypeStats.Partition p = ts.partitions[i];
586                     if (p != null && p.entities >= ProcessorStats.this.threshold) {
587                         final URI source = this.sourceList.get(i).source;
588                         final URI spURI = spURIs.get(source);
589                         final URI tpURI = mintURI(source, ts.type);
590                         final String tpLabel = Statements.formatValue(tpURI, Namespaces.DEFAULT)
591                                 + " (" + p.entities + ", C)";
592                         emit(ts.type, p == p0 ? VOIDX.GLOBAL_STATS : VOIDX.SOURCE_STATS, tpURI);
593                         emit(spURI, VOID.CLASS_PARTITION, tpURI);
594                         emit(tpURI, RDF.TYPE, VOID.DATASET);
595                         emit(tpURI, VOIDX.LABEL, tpLabel);
596                         emit(tpURI, VOIDX.SOURCE, source);
597                         emit(tpURI, VOID.CLASS, ts.type);
598                         emit(tpURI, VOID.ENTITIES, p.entities);
599                         emit(tpURI, VOID.TRIPLES, p.triples);
600                         emit(tpURI, VOIDX.TBOX_TRIPLES, p.tboxTriples);
601                         emit(tpURI, VOIDX.ABOX_TRIPLES, p.aboxTriples);
602                         emit(tpURI, VOIDX.TYPE_TRIPLES, p.typeTriples);
603                         emit(tpURI, VOIDX.SAME_AS_TRIPLES, p.sameAsTriples);
604                         if (p.types != null) {
605                             emit(tpURI, VOID.CLASSES, p.types.cardinality());
606                         }
607                         if (p.properties != null) {
608                             emit(tpURI, VOID.PROPERTIES, p.properties.cardinality());
609                         }
610                         if (p.entities > 0) {
611                             emit(tpURI, VOIDX.AVERAGE_PROPERTIES, (double) p.predicates
612                                     / p.entities);
613                         }
614                     }
615                 }
616             }
617 
618             for (final PropertyStats ps : this.propertyList) {
619                 final PropertyStats.Partition p0 = ps.partitions[0];
620                 if (p0.triples < ProcessorStats.this.threshold) {
621                     continue;
622                 }
623                 final boolean isTBox = Statements.TBOX_PROPERTIES.contains(ps.property);
624                 final boolean isType = ps.property.equals(RDF.TYPE);
625                 final boolean isSameAs = ps.property.equals(OWL.SAMEAS);
626                 final boolean fun = p0.triples > 0 && p0.triples == p0.distinctSubjects;
627                 final boolean invfun = p0.triples > 0 && p0.triples == p0.distinctObjects;
628                 final boolean data = OWL.DATATYPEPROPERTY.equals(ps.detectedType);
629                 final boolean object = OWL.OBJECTPROPERTY.equals(ps.detectedType);
630                 final String label = String.format("%s (%d, %s%s%s)", Statements.formatValue(
631                         ps.property, Namespaces.DEFAULT), p0.triples, data ? "D" : object ? "O"
632                         : "P", fun ? "F" : "", invfun ? "I" : "");
633                 emit(ps.property, VOIDX.LABEL, label);
634                 emit(ps.property, VOIDX.TYPE, ps.detectedType);
635                 if (fun) {
636                     emit(ps.property, VOIDX.TYPE, OWL.FUNCTIONALPROPERTY);
637                 }
638                 if (invfun) {
639                     emit(ps.property, VOIDX.TYPE, OWL.INVERSEFUNCTIONALPROPERTY);
640                 }
641                 if (ps.example != null) {
642                     emit(ps.property, VOIDX.EXAMPLE, ps.example);
643                 }
644                 for (int i = 0; i < ps.partitions.length; ++i) {
645                     final PropertyStats.Partition p = ps.partitions[i];
646                     if (p != null && p.triples >= ProcessorStats.this.threshold) {
647                         final URI source = this.sourceList.get(i).source;
648                         final URI spURI = spURIs.get(source);
649                         final URI ppURI = mintURI(source, ps.property);
650                         final boolean ppFun = p.triples > 0 && p.triples == p.distinctSubjects;
651                         final boolean ppInvfun = p.triples > 0 && p.triples == p.distinctObjects;
652                         final String ppLabel = String.format("%s (%d, %s%s%s)", Statements
653                                 .formatValue(ppURI, Namespaces.DEFAULT), p.triples, data ? "D"
654                                 : object ? "O" : "P", ppFun ? "F" : "", ppInvfun ? "I" : "");
655                         emit(ps.property, p == p0 ? VOIDX.GLOBAL_STATS : VOIDX.SOURCE_STATS, ppURI);
656                         emit(spURI, VOID.PROPERTY_PARTITION, ppURI);
657                         emit(ppURI, RDF.TYPE, VOID.DATASET);
658                         emit(ppURI, VOIDX.LABEL, ppLabel);
659                         emit(ppURI, VOIDX.SOURCE, source);
660                         emit(ppURI, VOID.PROPERTY, ps.property);
661                         emit(ppURI, VOID.CLASSES, 0);
662                         emit(ppURI, VOID.PROPERTIES, 1);
663                         emit(ppURI, VOID.ENTITIES, p.entities);
664                         emit(ppURI, VOID.TRIPLES, p.triples);
665                         emit(ppURI, VOIDX.TBOX_TRIPLES, isTBox ? p.triples : 0);
666                         emit(ppURI, VOIDX.ABOX_TRIPLES, isTBox ? 0 : p.triples);
667                         emit(ppURI, VOIDX.TYPE_TRIPLES, isType ? p.triples : 0);
668                         emit(ppURI, VOIDX.SAME_AS_TRIPLES, isSameAs ? p.triples : 0);
669                         emit(ppURI, VOID.DISTINCT_SUBJECTS, p.distinctSubjects);
670                         emit(ppURI, VOID.DISTINCT_OBJECTS, p.distinctObjects);
671                     }
672                 }
673             }
674 
675             for (final URI term : VOID.TERMS) {
676                 emit(term, VOIDX.LABEL, Statements.formatValue(term, Namespaces.DEFAULT));
677             }
678             for (final URI term : VOIDX.TERMS) {
679                 emit(term, VOIDX.LABEL, Statements.formatValue(term, Namespaces.DEFAULT));
680             }
681         }
682 
683         private void emit(@Nullable final Resource subject, @Nullable final URI predicate,
684                 @Nullable final Object object) throws RDFHandlerException {
685 
686             Value value = null;
687 
688             if (subject != null && predicate != null) {
689                 if (object instanceof Value) {
690                     value = (Value) object;
691                 } else if (object instanceof Integer && ((Integer) object).intValue() != 0) {
692                     value = Statements.VALUE_FACTORY.createLiteral((Integer) object);
693                 } else if (object instanceof Long && ((Long) object).longValue() != 0L) {
694                     value = Statements.VALUE_FACTORY.createLiteral((Long) object);
695                 } else if (object instanceof Double && ((Double) object).doubleValue() != 0.0) {
696                     value = Statements.VALUE_FACTORY.createLiteral((Double) object);
697                 } else if (object instanceof String && !((String) object).isEmpty()) {
698                     value = Statements.VALUE_FACTORY.createLiteral((String) object,
699                             XMLSchema.STRING);
700                 }
701             }
702 
703             if (value != null) {
704                 this.handler.handleStatement(Statements.VALUE_FACTORY.createStatement(subject,
705                         predicate, value));
706             }
707         }
708 
709         private URI mintURI(final URI... inputURIs) {
710             final StringBuilder builder = new StringBuilder();
711             if (ProcessorStats.this.outputNamespace != null) {
712                 builder.append(ProcessorStats.this.outputNamespace);
713             } else {
714                 builder.append("stats:");
715             }
716             boolean started = false;
717             for (final URI uri : inputURIs) {
718                 if (uri != null) {
719                     if (started) {
720                         builder.append("_");
721                     }
722                     started = true;
723                     builder.append(uri.getLocalName());
724                 }
725             }
726             final String base = builder.toString();
727             for (int i = 0; i < 1000; ++i) {
728                 final String candidate = i == 0 ? base : base + "_" + i;
729                 if (this.mintedURIs.add(candidate)) {
730                     return Statements.VALUE_FACTORY.createURI(candidate);
731                 }
732             }
733             throw new Error();
734         }
735 
736     }
737 
738     private static final class PartialStats {
739 
740         @Nullable
741         Set<TypeStats> tss;
742 
743         @Nullable
744         Set<PropertyStats> pss;
745 
746         @Nullable
747         BitSet types;
748 
749         @Nullable
750         BitSet properties;
751 
752         long entities;
753 
754         long triples;
755 
756         long tboxTriples;
757 
758         long aboxTriples;
759 
760         long typeTriples;
761 
762         long sameAsTriples;
763 
764     }
765 
766     private static final class SourceStats {
767 
768         @Nullable
769         final URI source;
770 
771         final int index;
772 
773         @Nullable
774         BitSet types;
775 
776         @Nullable
777         BitSet properties;
778 
779         long entities;
780 
781         long triples;
782 
783         long tboxTriples;
784 
785         long aboxTriples;
786 
787         long typeTriples;
788 
789         long sameAsTriples;
790 
791         SourceStats(final URI source, final int index) {
792             this.source = source;
793             this.index = index;
794             this.types = null;
795             this.properties = null;
796             this.entities = 0;
797             this.triples = 0;
798             this.tboxTriples = 0;
799             this.aboxTriples = 0;
800             this.typeTriples = 0;
801             this.sameAsTriples = 0;
802         }
803 
804     }
805 
806     private static final class TypeStats {
807 
808         @Nullable
809         final URI type;
810 
811         final int index;
812 
813         @Nullable
814         Sampler sampler;
815 
816         @Nullable
817         String example;
818 
819         @Nullable
820         Partition[] partitions;
821 
822         TypeStats(@Nullable final URI type, final int index) {
823             this.type = type;
824             this.index = index;
825         }
826 
827         static final class Partition {
828 
829             BitSet types;
830 
831             BitSet properties;
832 
833             long entities;
834 
835             long triples;
836 
837             long tboxTriples;
838 
839             long aboxTriples;
840 
841             long typeTriples;
842 
843             long sameAsTriples;
844 
845             long predicates;
846 
847         }
848 
849         static class Sampler {
850 
851             private static final int MAX_VALUE_LENGTH = 40;
852 
853             private static final int MAX_STATEMENTS = 20;
854 
855             private URI id;
856 
857             private final List<Value> data;
858 
859             Sampler() {
860                 this.data = new ArrayList<Value>();
861             }
862 
863             synchronized void add(final Statement statement) {
864                 if (this.data.size() < MAX_STATEMENTS * 2) {
865                     this.id = (URI) statement.getSubject();
866                     this.data.add(statement.getPredicate());
867                     this.data.add(statement.getObject());
868                 }
869             }
870 
871             synchronized String build() {
872                 final List<String> lines = new ArrayList<String>();
873                 for (int i = 0; i < this.data.size(); i += 2) {
874                     final String predicate = Statements.formatValue(this.data.get(i),
875                             Namespaces.DEFAULT);
876                     final String object = Statements.formatValue(
877                             Statements.shortenValue(this.data.get(i + 1), MAX_VALUE_LENGTH),
878                             Namespaces.DEFAULT);
879                     lines.add(predicate + " " + object);
880                 }
881                 Collections.sort(lines);
882                 final StringBuilder builder = new StringBuilder(Statements.formatValue(this.id,
883                         Namespaces.DEFAULT));
884                 for (int i = 0; i < lines.size(); ++i) {
885                     builder.append("\n    ").append(lines.get(i));
886                     builder.append(i < lines.size() - 1 ? ';' : '.');
887                 }
888                 return builder.toString();
889             }
890 
891         }
892 
893     }
894 
895     private static final class PropertyStats {
896 
897         @Nullable
898         final URI property;
899 
900         final int index;
901 
902         @Nullable
903         Sampler sampler;
904 
905         @Nullable
906         String example;
907 
908         @Nullable
909         URI detectedType;
910 
911         @Nullable
912         Partition[] partitions;
913 
914         PropertyStats(final URI property, final int index) {
915             this.property = property;
916             this.index = index;
917             this.detectedType = null;
918         }
919 
920         static final class Partition {
921 
922             long entities;
923 
924             long triples;
925 
926             long distinctSubjects;
927 
928             long distinctObjects;
929 
930             long version;
931 
932         }
933 
934         static final class Sampler {
935 
936             private static final int MAX_VALUE_LENGTH = 40;
937 
938             private static final int MAX_STATEMENTS = 3;
939 
940             private final Statement[] statements;
941 
942             private boolean haveBNode;
943 
944             private boolean haveLiteral;
945 
946             private boolean haveURI;
947 
948             private int size;
949 
950             Sampler() {
951                 this.statements = new Statement[MAX_STATEMENTS];
952                 this.haveBNode = false;
953                 this.haveLiteral = false;
954                 this.haveURI = false;
955                 this.size = 0;
956             }
957 
958             synchronized void add(final Statement statement) {
959 
960                 final Resource s = statement.getSubject();
961                 final Value o = statement.getObject();
962                 final boolean isURI = o instanceof URI;
963                 final boolean isBNode = o instanceof BNode;
964                 final boolean isLiteral = o instanceof Literal;
965 
966                 if (!(s instanceof URI)
967                         || this.size == this.statements.length
968                         && (isURI && this.haveURI || isBNode && this.haveBNode || isLiteral
969                                 && this.haveLiteral)) {
970                     return;
971                 }
972 
973                 int index = -1;
974                 for (int i = 0; i < this.statements.length; ++i) {
975                     final Statement stmt = this.statements[i];
976                     if (stmt == null) {
977                         index = i;
978                         ++this.size;
979                         break;
980                     } else if (stmt.equals(statement)) {
981                         return;
982                     } else if (!this.haveURI && isURI //
983                             || !this.haveBNode && isBNode //
984                             || !this.haveLiteral && isLiteral) {
985                         index = i;
986                         break;
987                     }
988                 }
989                 if (index >= 0) {
990                     this.statements[index] = statement;
991                     this.haveURI |= isURI;
992                     this.haveBNode |= isBNode;
993                     this.haveLiteral |= isLiteral;
994                 }
995             }
996 
997             synchronized String build() {
998                 final StringBuilder builder = new StringBuilder();
999                 for (final Statement statement : this.statements) {
1000                     if (statement != null) {
1001                         builder.append("\n    ")
1002                                 .append(Statements.formatValue(statement.getSubject(),
1003                                         Namespaces.DEFAULT))
1004                                 .append(" ")
1005                                 .append(Statements.formatValue(statement.getPredicate(),
1006                                         Namespaces.DEFAULT))
1007                                 .append(" ")
1008                                 .append(Statements.formatValue(Statements.shortenValue(
1009                                         statement.getObject(), MAX_VALUE_LENGTH),
1010                                         Namespaces.DEFAULT)).append(" .");
1011                     }
1012                 }
1013                 return builder.toString();
1014             }
1015 
1016         }
1017 
1018     }
1019 
1020     private static final class Context {
1021 
1022         private static final URI[] EMPTY = new URI[0];
1023 
1024         final int index;
1025 
1026         URI[] sources;
1027 
1028         boolean used;
1029 
1030         Context(final int index) {
1031             this.index = index;
1032             this.sources = EMPTY;
1033             this.used = false;
1034         }
1035 
1036     }
1037 
1038     private static final class Record {
1039 
1040         final boolean inverse;
1041 
1042         @Nullable
1043         final Hash subject;
1044 
1045         final int property;
1046 
1047         final int type;
1048 
1049         @Nullable
1050         final Hash object;
1051 
1052         final int context;
1053 
1054         private Record(final boolean inverse, final Hash subject, final int predicate,
1055                 final int type, final Hash object, final int context) {
1056             assert !inverse || object != null;
1057             assert inverse || subject != null;
1058             this.inverse = inverse;
1059             this.subject = subject;
1060             this.property = predicate;
1061             this.type = type;
1062             this.object = object;
1063             this.context = context;
1064 
1065         }
1066 
1067         public static Record create(final boolean inverse, @Nullable final Hash subject,
1068                 final int predicate, final int type, @Nullable final Hash object,
1069                 @Nullable final int context) {
1070             return new Record(inverse, subject, predicate, type, object, context);
1071         }
1072 
1073         public static Record read(final Input reader) throws IOException {
1074 
1075             final Hash hash = Hash.read(reader);
1076 
1077             boolean inverse = false;
1078             Hash subject = null;
1079             Hash object = null;
1080             int predicate = -1;
1081             int type = -1;
1082             int context = -1;
1083 
1084             final int c = (int) reader.readNumber();
1085             final boolean hasContext = (c & 0x01) != 0;
1086             final int format = c & 0xE;
1087 
1088             if (format == 8) {
1089                 // o p c
1090                 inverse = true;
1091                 object = hash;
1092                 predicate = (int) reader.readNumber();
1093             } else if (format == 4) {
1094                 // s t c
1095                 subject = hash;
1096                 type = (int) reader.readNumber();
1097                 predicate = 0; // explicit mapping of rdf:type to 0
1098             } else if (format == 2) {
1099                 // s p o c
1100                 subject = hash;
1101                 predicate = (int) reader.readNumber();
1102                 object = Hash.read(reader);
1103             } else {
1104                 throw new Error("format is " + format);
1105             }
1106 
1107             if (hasContext) {
1108                 context = (int) reader.readNumber();
1109             }
1110 
1111             return create(inverse, subject, predicate, type, object, context);
1112         }
1113 
1114         public void write(final Output writer) throws IOException {
1115 
1116             final int flag = this.context >= 0 ? 1 : 0;
1117 
1118             if (this.inverse) {
1119                 // o p c -> hash(o) byte(flag) num(p) num(c)
1120                 this.object.write(writer);
1121                 writer.writeNumber(flag + 8);
1122                 writer.writeNumber(this.property);
1123             } else if (this.object == null) {
1124                 // s t c -> hash(s) char(flag) 4*char(type, 127 each) hash(c)
1125                 this.subject.write(writer);
1126                 writer.writeNumber(flag + 4);
1127                 writer.writeNumber(this.type);
1128             } else {
1129                 // s p o c -> hash(s) char(flag) 4*char(p) hash(o) hash(c)
1130                 this.subject.write(writer);
1131                 writer.writeNumber(flag + 2);
1132                 writer.writeNumber(this.property);
1133                 this.object.write(writer);
1134             }
1135 
1136             if (this.context >= 0) {
1137                 writer.writeNumber(this.context);
1138             }
1139         }
1140 
1141     }
1142 
1143     // TODO: revise following class to better use eu.fbk.rdfpro.util.Hash
1144 
1145     private static final class Hash {
1146 
1147         private static final int MAX_LENGTH = 4 * 1024;
1148 
1149         private static final int TABLE_SIZE = 4 * 1024 - 1;
1150 
1151         private static final Hash[] TABLE_HASHES = new Hash[TABLE_SIZE];
1152 
1153         private static final Value[] TABLE_VALUES = new Value[TABLE_SIZE];
1154 
1155         private static final Index<URI> DATATYPE_INDEX = new Index<URI>(1024);
1156 
1157         private static final Index<String> LANGUAGE_INDEX = new Index<String>(1024);
1158 
1159         private final long lo;
1160 
1161         private final long hi;
1162 
1163         public static Hash read(final Input reader) throws IOException {
1164             final long lo = reader.readNumber();
1165             final long hi = reader.readNumber();
1166             return new Hash(lo, hi);
1167         }
1168 
1169         public static Hash create(final Value value) {
1170             if (value.stringValue().length() > MAX_LENGTH) {
1171                 return compute(value);
1172             }
1173             final int index = (value.hashCode() & 0x7FFFFFFF) % TABLE_SIZE;
1174             synchronized (TABLE_VALUES) {
1175                 if (value.equals(TABLE_VALUES[index])) {
1176                     return TABLE_HASHES[index];
1177                 }
1178             }
1179             final Hash hash = compute(value);
1180             synchronized (TABLE_VALUES) {
1181                 TABLE_VALUES[index] = value;
1182                 TABLE_HASHES[index] = hash;
1183             }
1184             return hash;
1185         }
1186 
1187         private static Hash compute(final Value value) {
1188 
1189             final String string = value.stringValue();
1190 
1191             boolean doHash = true;
1192             long lo = 0;
1193             long hi = 0;
1194 
1195             final int length = string.length();
1196             if (length <= 15) {
1197                 doHash = false;
1198                 long cur = 0;
1199                 for (int i = 0; i < 16; ++i) {
1200                     int c = 1;
1201                     if (i < length) {
1202                         c = string.charAt(i);
1203                         if (c <= 0 || c >= 128) {
1204                             doHash = true;
1205                             break;
1206                         }
1207                     }
1208                     cur = cur << 8 | c;
1209                     if (i == 7) {
1210                         lo = cur;
1211                         cur = 0;
1212                     }
1213                 }
1214                 hi = cur;
1215             }
1216 
1217             if (doHash) {
1218                 final eu.fbk.rdfpro.util.Hash hash = eu.fbk.rdfpro.util.Hash.murmur3(string);
1219                 lo = hash.getLow();
1220                 hi = hash.getHigh();
1221             }
1222 
1223             lo = (lo & 0x7F7F7F7F7F7F7F7FL) + 0x0101010101010101L;
1224             lo = lo & 0x7F7F7F7F7F7F7F7FL | (lo & 0x8080808080808080L) >> 1;
1225             hi = (hi & 0x7F7F7F7F7F7F7F7FL) + 0x0101010101010101L;
1226             hi = hi & 0x7F7F7F7F7F7F7F7FL | (hi & 0x8080808080808080L) >> 1;
1227             hi = hi & 0x0FFFFFFFFFFFFFFFL | 0x4000000000000000L;
1228 
1229             if (value instanceof URI) {
1230                 hi = hi | 0x3000000000000000L;
1231             } else if (value instanceof BNode) {
1232                 hi = hi | 0x2000000000000000L;
1233             } else if (value instanceof Literal) {
1234                 hi = hi | 0x1000000000000000L;
1235                 final Literal literal = (Literal) value;
1236                 int index = 0;
1237                 if (literal.getLanguage() != null) {
1238                     index = LANGUAGE_INDEX.put(literal.getLanguage()) | 0x40000000;
1239                 } else if (literal.getDatatype() != null) {
1240                     index = DATATYPE_INDEX.put(literal.getDatatype());
1241                 }
1242                 index = index & 0x7FFFFFFF;
1243                 lo = (lo ^ index) & 0xFFFFFFFF7F7F7F7FL;
1244                 if ((lo & 0xFFL) == 0L) {
1245                     lo = lo | 0x01L;
1246                 }
1247                 if ((lo & 0xFF00L) == 0L) {
1248                     lo = lo | 0x0100L;
1249                 }
1250                 if ((lo & 0xFF0000L) == 0L) {
1251                     lo = lo | 0x010000L;
1252                 }
1253                 if ((lo & 0xFF000000L) == 0L) {
1254                     lo = lo | 0x01000000L;
1255                 }
1256             }
1257 
1258             return new Hash(lo, hi);
1259         }
1260 
1261         private Hash(final long lo, final long hi) {
1262             this.lo = lo;
1263             this.hi = hi;
1264         }
1265 
1266         public boolean isURI() {
1267             return (this.hi & 0x3000000000000000L) == 0x3000000000000000L;
1268         }
1269 
1270         public boolean isLiteral() {
1271             return (this.hi & 0x3000000000000000L) == 0x1000000000000000L;
1272         }
1273 
1274         @Override
1275         public boolean equals(final Object object) {
1276             if (object == this) {
1277                 return true;
1278             }
1279             if (!(object instanceof Hash)) {
1280                 return false;
1281             }
1282             final Hash other = (Hash) object;
1283             return this.lo == other.lo && this.hi == other.hi;
1284         }
1285 
1286         @Override
1287         public int hashCode() {
1288             final int hh = (int) (this.hi >> 32);
1289             final int hl = (int) this.hi;
1290             final int lh = (int) (this.lo >> 32);
1291             final int ll = (int) this.lo;
1292             return ((hh * 37 + hl) * 37 + lh) * 37 + ll;
1293         }
1294 
1295         public void write(final Output writer) throws IOException {
1296             writer.writeNumber(this.lo);
1297             writer.writeNumber(this.hi);
1298         }
1299 
1300     }
1301 
1302     private static final class Index<T> {
1303 
1304         private final Map<T, Integer> map;
1305 
1306         private final List<T> list;
1307 
1308         private final int size;
1309 
1310         Index(final int size) {
1311             final int capacity = Math.min(size, 1024);
1312             this.map = new HashMap<T, Integer>(capacity);
1313             this.list = new ArrayList<T>(capacity);
1314             this.size = size;
1315         }
1316 
1317         @Nullable
1318         synchronized Integer put(final T element) {
1319             Integer index = this.map.get(element);
1320             if (index == null && this.list.size() < this.size) {
1321                 index = this.list.size() + 1;
1322                 this.list.add(element);
1323                 this.map.put(element, index);
1324             }
1325             return index;
1326         }
1327 
1328         @Nullable
1329         synchronized T get(final int index) {
1330             return this.list.get(index - 1);
1331         }
1332 
1333     }
1334 
1335 }