1   /*
2    * RDFpro - An extensible tool for building stream-oriented RDF processing libraries.
3    * 
4    * Written in 2015 by Francesco Corcoglioniti with support by Alessio Palmero Aprosio and Marco
5    * Rospocher. Contact info on http://rdfpro.fbk.eu/
6    * 
7    * To the extent possible under law, the authors have dedicated all copyright and related and
8    * neighboring rights to this software to the public domain worldwide. This software is
9    * distributed without any warranty.
10   * 
11   * You should have received a copy of the CC0 Public Domain Dedication along with this software.
12   * If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
13   */
14  package eu.fbk.rdfpro.util;
15  
16  import java.math.BigDecimal;
17  import java.math.BigInteger;
18  import java.util.Arrays;
19  import java.util.concurrent.atomic.AtomicLong;
20  
21  import javax.annotation.Nullable;
22  import javax.xml.datatype.DatatypeConstants;
23  import javax.xml.datatype.XMLGregorianCalendar;
24  
25  import org.openrdf.model.BNode;
26  import org.openrdf.model.Literal;
27  import org.openrdf.model.Resource;
28  import org.openrdf.model.Statement;
29  import org.openrdf.model.URI;
30  import org.openrdf.model.Value;
31  import org.openrdf.model.ValueFactory;
32  import org.openrdf.model.vocabulary.XMLSchema;
33  import org.openrdf.rio.RDFHandler;
34  import org.openrdf.rio.RDFHandlerException;
35  import org.slf4j.Logger;
36  import org.slf4j.LoggerFactory;
37  
38  import eu.fbk.rdfpro.AbstractRDFHandler;
39  
40  // codes from 1 to 0xEFFFFFFF denote indexed values (15 GB indexable)
41  // - from 1 to 0xBFFFFFFF primary buffer can be used for values long less than 128 bytes (12 GB)
42  // - from 0xC0000000 to 0xEFFFFFFF secondary buffer always used (3 GB, 384M values encodable)
43  // codes from 0xF0000000 to 0xFFFFFFFF denote embedded values
44  
45  abstract class Dictionary implements AutoCloseable {
46  
47      private static final Logger LOGGER = LoggerFactory.getLogger(Dictionary.class);
48  
49      private static final int[] PACKING_TYPES;
50  
51      private static final SequentialDictionary<URI> INITIAL_DATATYPE_INDEX;
52  
53      private static final int XSD_STRING_INDEX;
54  
55      static final URI XSD_STRING = Statements.normalize(XMLSchema.STRING);
56  
57      static final int NULL_CODE = 0xE0000000;
58  
59      static final int TYPE_URI_FULL = 0;
60  
61      static final int TYPE_URI_NAME = 1;
62  
63      static final int TYPE_BNODE = 2;
64  
65      static final int TYPE_LITERAL_PLAIN = 4;
66  
67      static final int TYPE_LITERAL_LANG = 5;
68  
69      static final int TYPE_LITERAL_DT = 6;
70  
71      final static int PACKING_BIGDECIMAL = 1;
72  
73      final static int PACKING_BIGINTEGER = 2;
74  
75      final static int PACKING_LONG = 3;
76  
77      final static int PACKING_DOUBLE = 4;
78  
79      final static int PACKING_BOOLEAN = 5;
80  
81      final static int PACKING_DATETIME = 6;
82  
83      static {
84          final int[] t = new int[26]; // 25 + first slot unused
85          final SequentialDictionary<URI> i = new SequentialDictionary<>(64 * 1024 - 2);
86          t[i.encode(Statements.normalize(XMLSchema.DECIMAL))] = PACKING_BIGDECIMAL;
87          t[i.encode(Statements.normalize(XMLSchema.INTEGER))] = PACKING_BIGINTEGER;
88          t[i.encode(Statements.normalize(XMLSchema.NON_POSITIVE_INTEGER))] = PACKING_BIGINTEGER;
89          t[i.encode(Statements.normalize(XMLSchema.NEGATIVE_INTEGER))] = PACKING_BIGINTEGER;
90          t[i.encode(Statements.normalize(XMLSchema.NON_NEGATIVE_INTEGER))] = PACKING_BIGINTEGER;
91          t[i.encode(Statements.normalize(XMLSchema.POSITIVE_INTEGER))] = PACKING_BIGINTEGER;
92          t[i.encode(Statements.normalize(XMLSchema.LONG))] = PACKING_LONG;
93          t[i.encode(Statements.normalize(XMLSchema.INT))] = PACKING_LONG;
94          t[i.encode(Statements.normalize(XMLSchema.SHORT))] = PACKING_LONG;
95          t[i.encode(Statements.normalize(XMLSchema.BYTE))] = PACKING_LONG;
96          t[i.encode(Statements.normalize(XMLSchema.UNSIGNED_LONG))] = PACKING_LONG;
97          t[i.encode(Statements.normalize(XMLSchema.UNSIGNED_INT))] = PACKING_LONG;
98          t[i.encode(Statements.normalize(XMLSchema.UNSIGNED_SHORT))] = PACKING_LONG;
99          t[i.encode(Statements.normalize(XMLSchema.UNSIGNED_BYTE))] = PACKING_LONG;
100         t[i.encode(Statements.normalize(XMLSchema.DOUBLE))] = PACKING_DOUBLE;
101         t[i.encode(Statements.normalize(XMLSchema.FLOAT))] = PACKING_DOUBLE;
102         t[i.encode(Statements.normalize(XMLSchema.BOOLEAN))] = PACKING_BOOLEAN;
103         t[i.encode(Statements.normalize(XMLSchema.DATETIME))] = PACKING_DATETIME;
104         t[i.encode(Statements.normalize(XMLSchema.DATE))] = PACKING_DATETIME;
105         t[i.encode(Statements.normalize(XMLSchema.TIME))] = PACKING_DATETIME;
106         t[i.encode(Statements.normalize(XMLSchema.GYEARMONTH))] = PACKING_DATETIME;
107         t[i.encode(Statements.normalize(XMLSchema.GMONTHDAY))] = PACKING_DATETIME;
108         t[i.encode(Statements.normalize(XMLSchema.GYEAR))] = PACKING_DATETIME;
109         t[i.encode(Statements.normalize(XMLSchema.GMONTH))] = PACKING_DATETIME;
110         t[i.encode(Statements.normalize(XMLSchema.GDAY))] = PACKING_DATETIME;
111 
112         PACKING_TYPES = t;
113         INITIAL_DATATYPE_INDEX = i;
114         XSD_STRING_INDEX = i.encode(Statements.normalize(XMLSchema.STRING));
115 
116         // Non packable types:
117         // XMLSchema.DURATION XMLSchema.DAYTIMEDURATION XMLSchema.STRING XMLSchema.BASE64BINARY
118         // XMLSchema.HEXBINARY XMLSchema.ANYURI XMLSchema.QNAME XMLSchema.NOTATION
119         // XMLSchema.NORMALIZEDSTRING XMLSchema.TOKEN XMLSchema.LANGUAGE XMLSchema.NMTOKEN
120         // XMLSchema.NMTOKENS XMLSchema.NAME XMLSchema.NCNAME XMLSchema.ID XMLSchema.IDREF
121         // XMLSchema.IDREFS XMLSchema.ENTITY XMLSchema.ENTITIES
122     }
123 
124     private final SequentialDictionary<String> namespaceIndex;
125 
126     private final SequentialDictionary<String> languageIndex;
127 
128     private final SequentialDictionary<URI> datatypeIndex;
129 
130     private final EncodeCacheEntry[] encodeCache;
131 
132     private final DecodeCacheEntry[] decodeCache;
133 
134     private final AtomicLong encodeEmbeddedCounter;
135 
136     private final AtomicLong encodeCacheCounter;
137 
138     private final AtomicLong encodeIndexedCounter;
139 
140     private final AtomicLong decodeEmbeddedCounter;
141 
142     private final AtomicLong decodeCacheCounter;
143 
144     private final AtomicLong decodeIndexedCounter;
145 
146     Dictionary() {
147         this.namespaceIndex = new SequentialDictionary<>(64 * 1024 - 2);
148         this.languageIndex = new SequentialDictionary<>(64 * 1024 - 2);
149         this.datatypeIndex = new SequentialDictionary<>(INITIAL_DATATYPE_INDEX);
150         this.encodeCache = new EncodeCacheEntry[1024 - 1];
151         this.decodeCache = new DecodeCacheEntry[1024 - 1];
152         this.encodeEmbeddedCounter = new AtomicLong();
153         this.encodeCacheCounter = new AtomicLong();
154         this.encodeIndexedCounter = new AtomicLong();
155         this.decodeEmbeddedCounter = new AtomicLong();
156         this.decodeCacheCounter = new AtomicLong();
157         this.decodeIndexedCounter = new AtomicLong();
158     }
159 
160     abstract int doEncode(int type, int index, String string);
161 
162     abstract Value doDecode(int code);
163 
164     int doType(final int code) {
165         final Value value = decode(code);
166         if (value instanceof BNode) {
167             return TYPE_BNODE;
168         } else if (value instanceof URI) {
169             return ((URI) value).getLocalName().isEmpty() ? TYPE_URI_FULL : TYPE_URI_NAME;
170         } else {
171             final Literal lit = (Literal) value;
172             return lit.getLanguage() != null ? TYPE_LITERAL_LANG : lit.getDatatype() != null
173                     && !lit.getDatatype().equals(XSD_STRING) ? TYPE_LITERAL_DT
174                     : TYPE_LITERAL_PLAIN;
175         }
176     }
177 
178     void doToString(final StringBuilder builder) {
179     }
180 
181     void doClose() {
182     }
183 
184     final Value valueFor(final int type, final int index, final String string) {
185 
186         final ValueFactory vf = Statements.VALUE_FACTORY;
187         switch (type) {
188         case TYPE_URI_FULL:
189             return vf.createURI(string);
190         case TYPE_URI_NAME:
191             return vf.createURI(this.namespaceIndex.decode(index), string);
192         case TYPE_BNODE:
193             return vf.createBNode(string);
194         case TYPE_LITERAL_PLAIN:
195             return vf.createLiteral(string);
196         case TYPE_LITERAL_DT:
197             return vf.createLiteral(string, this.datatypeIndex.decode(index));
198         case TYPE_LITERAL_LANG:
199             return vf.createLiteral(string, this.languageIndex.decode(index));
200         default:
201             throw new Error();
202         }
203     }
204 
205     public static Dictionary newMemoryDictionary() {
206         return new MemoryDictionary();
207     }
208 
209     public final RDFHandler encode(final QuadHandler sink) {
210         return new AbstractRDFHandler() {
211 
212             @Override
213             public void startRDF() throws RDFHandlerException {
214                 sink.start();
215             }
216 
217             @Override
218             public void handleStatement(final Statement stmt) throws RDFHandlerException {
219                 final int subj = encode(stmt.getSubject());
220                 final int pred = encode(stmt.getPredicate());
221                 final int obj = encode(stmt.getObject());
222                 final int ctx = encode(stmt.getContext());
223                 sink.handle(subj, pred, obj, ctx);
224             }
225 
226             @Override
227             public void endRDF() throws RDFHandlerException {
228                 sink.end();
229             }
230 
231             @Override
232             public void close() {
233                 sink.close();
234             }
235 
236         };
237     }
238 
239     void encodeCachePut(@Nullable final Value value, final int code) {
240         final int hash = value.hashCode() & 0x7FFFFFFF;
241         final int slotIndex = hash % this.encodeCache.length;
242         final EncodeCacheEntry entry = new EncodeCacheEntry(code, hash, value);
243         this.encodeCache[slotIndex] = entry;
244     }
245 
246     public final int encode(@Nullable final Value value) {
247 
248         // Handle default context
249         if (value == null) {
250             return NULL_CODE;
251         }
252 
253         // Handle literals whose value can be fully packed in the code
254         int index = 0;
255         if (value instanceof Literal) {
256             final Literal l = (Literal) value;
257             if (l.getLanguage() == null) {
258                 index = this.datatypeIndex.encode(l.getDatatype());
259                 if (index < PACKING_TYPES.length) {
260                     final int v = pack(PACKING_TYPES[index], l);
261                     if (v >= 0) {
262                         this.encodeEmbeddedCounter.incrementAndGet();
263                         return 0xE0000000 | index << 24 | v;
264                     }
265                 }
266             }
267         }
268 
269         // Lookup a pre-computed code in the cache
270         final int hash = value.hashCode();
271         final int slotIndex = (hash & 0x7FFFFFFF) % this.encodeCache.length;
272         final EncodeCacheEntry entry = this.encodeCache[slotIndex];
273         if (entry != null && entry.hash == hash && entry.value.equals(value)) {
274             this.encodeCacheCounter.incrementAndGet();
275             return entry.code;
276         }
277 
278         // On cache miss, split the value in type + index + string (datatype index reused)
279         int type;
280         String string;
281         if (value instanceof Literal) {
282             final Literal l = (Literal) value;
283             string = l.getLabel();
284             if (l.getLanguage() != null) {
285                 type = TYPE_LITERAL_LANG;
286                 index = this.languageIndex.encode(l.getLanguage());
287             } else if (index != XSD_STRING_INDEX) {
288                 type = TYPE_LITERAL_DT;
289             } else {
290                 type = TYPE_LITERAL_PLAIN;
291                 index = 0;
292             }
293         } else if (value instanceof URI) {
294             final URI u = (URI) value;
295             index = this.namespaceIndex.encode(u.getNamespace(), 0);
296             if (index > 0) {
297                 type = TYPE_URI_NAME;
298                 string = u.getLocalName();
299             } else {
300                 type = TYPE_URI_FULL;
301                 string = u.stringValue();
302             }
303         } else if (value instanceof BNode) {
304             type = TYPE_BNODE;
305             string = ((BNode) value).getID();
306         } else {
307             throw new Error();
308         }
309 
310         // Delegate and cache the result
311         final int code = doEncode(type, index, string);
312         this.encodeCache[slotIndex] = new EncodeCacheEntry(code, hash, value);
313         this.encodeIndexedCounter.incrementAndGet();
314         return code;
315     }
316 
317     public final QuadHandler decode(final RDFHandler sink) {
318         return new QuadHandler() {
319 
320             @Override
321             public void start() throws RDFHandlerException {
322                 sink.startRDF();
323             }
324 
325             @Override
326             public void handle(final int subj, final int pred, final int obj, final int ctx)
327                     throws RDFHandlerException {
328                 final Resource s = (Resource) decode(subj);
329                 final URI p = (URI) decode(pred);
330                 final Value o = decode(obj);
331                 final Resource c = (Resource) decode(ctx);
332                 sink.handleStatement(Statements.VALUE_FACTORY.createStatement(s, p, o, c));
333             }
334 
335             @Override
336             public void end() throws RDFHandlerException {
337                 sink.endRDF();
338             }
339 
340             @Override
341             public void close() {
342                 IO.closeQuietly(sink);
343             }
344 
345         };
346     }
347 
348     @Nullable
349     public final Value decode(final int code) {
350 
351         // Handle default context
352         if (code == NULL_CODE) {
353             return null;
354         }
355 
356         // Handle packed values
357         if (code >>> 29 == 0x7) {
358             final int index = (code & 0x1F000000) >>> 24;
359             final URI datatype = this.datatypeIndex.decode(index);
360             final String label = unpack(PACKING_TYPES[index], code & 0xFFFFFF);
361             final Value value = Statements.VALUE_FACTORY.createLiteral(label, datatype);
362             this.decodeEmbeddedCounter.incrementAndGet();
363             return value;
364         }
365 
366         // Lookup the code in the cache
367         final int hash = hashInt(code);
368         final int slotIndex = (hash & 0x7FFFFFFF) % this.decodeCache.length;
369         final DecodeCacheEntry entry = this.decodeCache[slotIndex];
370         if (entry != null && entry.code == code) {
371             this.decodeCacheCounter.incrementAndGet();
372             return entry.value;
373         }
374 
375         // Delegate and cache the result
376         final Value value = doDecode(code);
377         this.decodeCache[slotIndex] = new DecodeCacheEntry(code, value);
378         this.decodeIndexedCounter.incrementAndGet();
379         return value;
380     }
381 
382     public final boolean isNull(final int code) {
383         return code == NULL_CODE;
384     }
385 
386     public final boolean isResource(final int code) {
387         return doType(code) <= TYPE_BNODE;
388     }
389 
390     public final boolean isURI(final int code) {
391         return doType(code) <= TYPE_URI_NAME;
392     }
393 
394     public final boolean isBNode(final int code) {
395         return doType(code) == TYPE_BNODE;
396     }
397 
398     public final boolean isLiteral(final int code) {
399         return doType(code) >= TYPE_LITERAL_PLAIN;
400     }
401 
402     @Override
403     public final void close() {
404         doClose();
405     }
406 
407     @Override
408     public final String toString() {
409         final StringBuilder builder = new StringBuilder(getClass().getSimpleName()).append(": ");
410         final int oldLength = builder.length();
411         doToString(builder);
412         builder.append(builder.length() > oldLength ? ", " : "");
413         builder.append(this.namespaceIndex.size()).append(" namespaces, ")
414                 .append(this.datatypeIndex.size()).append(" datatypes, ")
415                 .append(this.languageIndex.size()).append(" languages, ");
416         toStringHelper(builder, "encode", this.encodeEmbeddedCounter.get(),
417                 this.encodeCacheCounter.get(), this.encodeIndexedCounter.get());
418         builder.append(", ");
419         toStringHelper(builder, "decode", this.decodeEmbeddedCounter.get(),
420                 this.decodeCacheCounter.get(), this.decodeIndexedCounter.get());
421         return builder.toString();
422     }
423 
424     private void toStringHelper(final StringBuilder builder, final String type,
425             final long embedded, final long cached, final long indexed) {
426         final long total = embedded + cached + indexed;
427         builder.append(total).append(" ").append(type).append(" calls (").append(embedded)
428                 .append(" embedded, ").append(cached).append(" cached, ").append(indexed)
429                 .append(" indexed)");
430     }
431 
432     private static int pack(final int packingType, final Literal literal) {
433 
434         switch (packingType) {
435         case PACKING_BIGDECIMAL:
436             final BigDecimal bd = literal.decimalValue();
437             final int scale = bd.scale();
438             if (scale < 1 << 4 && scale > -(1 << 4)) {
439                 final BigInteger unscaled = bd.unscaledValue();
440                 if (unscaled.bitLength() <= 18) {
441                     return scale + (1 << 4) << 19 | unscaled.intValue() + (1 << 18);
442                 }
443             }
444             break;
445 
446         case PACKING_BIGINTEGER:
447             final BigInteger bi = literal.integerValue();
448             if (bi.bitLength() <= 23) {
449                 return bi.intValue() + (1 << 23);
450             }
451             break;
452 
453         case PACKING_LONG:
454             final long l = literal.longValue();
455             if (l < 1 << 23 && l >= -(1 << 23)) {
456                 return (int) l + (1 << 23);
457             }
458             break;
459 
460         case PACKING_DOUBLE:
461             final long bits = Double.doubleToLongBits(literal.doubleValue());
462             if ((bits & 0xFFFFFFFFFFL) == 0L) {
463                 return (int) (bits >>> 40);
464             }
465             break;
466 
467         case PACKING_BOOLEAN:
468             return literal.booleanValue() ? 1 : 0;
469 
470         case PACKING_DATETIME:
471             final XMLGregorianCalendar c = literal.calendarValue();
472             if (c.getTimezone() == DatatypeConstants.FIELD_UNDEFINED
473                     && c.getMillisecond() == DatatypeConstants.FIELD_UNDEFINED
474                     && c.getSecond() == DatatypeConstants.FIELD_UNDEFINED
475                     && c.getMinute() == DatatypeConstants.FIELD_UNDEFINED
476                     && c.getHour() == DatatypeConstants.FIELD_UNDEFINED) {
477                 final int year = c.getYear();
478                 final int month = c.getMonth();
479                 final int day = c.getDay();
480                 if (year == DatatypeConstants.FIELD_UNDEFINED || year < 1 << 14
481                         && year > -(1 << 14)) {
482                     return (day != DatatypeConstants.FIELD_UNDEFINED ? day : 0)
483                             | (month != DatatypeConstants.FIELD_UNDEFINED ? month : 0) << 5
484                             | (year != DatatypeConstants.FIELD_UNDEFINED ? year + (1 << 14) : 0) << 5 + 4;
485                 }
486             }
487             break;
488         }
489 
490         // Signal failure
491         return -1;
492     }
493 
494     private static String unpack(final int packingType, final int value) {
495 
496         switch (packingType) {
497         case PACKING_BIGDECIMAL:
498             final int scale = (value >> 19 & 0x1F) - (1 << 4);
499             final int unscaled = (value & 0x7FFFF) - (1 << 18);
500             return new BigDecimal(BigInteger.valueOf(unscaled), scale).toString();
501 
502         case PACKING_BIGINTEGER:
503             return Integer.toString((value & 0xFFFFFF) - (1 << 23));
504 
505         case PACKING_LONG:
506             return Integer.toString((value & 0xFFFFFF) - (1 << 23));
507 
508         case PACKING_DOUBLE:
509             return Double.toString(Double.longBitsToDouble((long) (value & 0xFFFFFF) << 40));
510 
511         case PACKING_BOOLEAN:
512             return (value & 0x1) != 0 ? "true" : "false";
513 
514         case PACKING_DATETIME:
515             final XMLGregorianCalendar c = Statements.DATATYPE_FACTORY.newXMLGregorianCalendar();
516             final int year = value >>> 5 + 4 & 0x7FFF;
517             final int month = value >>> 5 & 0x0F;
518             final int day = value & 0x1F;
519             c.setDay(day != 0 ? day : DatatypeConstants.FIELD_UNDEFINED);
520             c.setMonth(month != 0 ? month : DatatypeConstants.FIELD_UNDEFINED);
521             c.setYear(year != 0 ? year - (1 << 14) : DatatypeConstants.FIELD_UNDEFINED);
522             return c.toXMLFormat();
523 
524         default:
525             throw new Error("Unexpected type " + packingType);
526         }
527     }
528 
529     private static int hashInt(final int value) {
530         int x = value;
531         x = (x >>> 16 ^ x) * 0x45d9f3b;
532         x = (x >>> 16 ^ x) * 0x45d9f3b;
533         x = x >>> 16 ^ x;
534         return x;
535     }
536 
537     public interface QuadHandler extends AutoCloseable {
538 
539         final QuadHandler NIL = new QuadHandler() {
540 
541             @Override
542             public void handle(final int subj, final int pred, final int obj, final int ctx) {
543             }
544 
545         };
546 
547         default void start() throws RDFHandlerException {
548         }
549 
550         void handle(int subj, int pred, int obj, int ctx) throws RDFHandlerException;
551 
552         default void end() throws RDFHandlerException {
553         }
554 
555         @Override
556         default void close() {
557         }
558 
559     }
560 
561     private static final class EncodeCacheEntry {
562 
563         final int code;
564 
565         final int hash;
566 
567         final Value value;
568 
569         EncodeCacheEntry(final int code, final int hash, final Value value) {
570             this.code = code;
571             this.hash = hash;
572             this.value = value;
573         }
574 
575     }
576 
577     private static final class DecodeCacheEntry {
578 
579         final int code;
580 
581         final Value value;
582 
583         DecodeCacheEntry(final int code, final Value value) {
584             this.code = code;
585             this.value = value;
586         }
587 
588     }
589 
590     private static final class SequentialDictionary<T> {
591 
592         private final int capacity;
593 
594         private Object[] array;
595 
596         private long[] table;
597 
598         private int size;
599 
600         SequentialDictionary(final int capacity) {
601             this.capacity = capacity;
602             this.array = new Object[Math.min(capacity + 1, 8)];
603             this.table = new long[8];
604             this.size = 0;
605         }
606 
607         SequentialDictionary(final SequentialDictionary<T> index) {
608             this.capacity = index.capacity;
609             this.array = index.array.clone();
610             this.table = index.table.clone();
611             this.size = index.size;
612         }
613 
614         int size() {
615             return this.size;
616         }
617 
618         int encode(final T element, final int resultIfFull) {
619             final long[] table = this.table; // may change on rehash
620             Object[] array = this.array; // may change on rehash
621             final int mask = table.length - 1;
622             final int hash = element.hashCode();
623             for (int slot = hash & mask;; slot = slot + 1 & mask) {
624                 final long cell = table[slot];
625                 if (cell == 0L) {
626                     synchronized (this) {
627                         if (table == this.table && table[slot] == 0L) {
628                             array = this.array; // may have changed in the meanwhile
629                             if (this.size == this.capacity) {
630                                 return resultIfFull;
631                             }
632                             if (this.size > table.length / 3 * 2) {
633                                 rehash(); // enforce load factor < .66
634                             } else {
635                                 final int index = this.size + 1; // Skip index 0
636                                 if (index >= array.length) {
637                                     array = Arrays.copyOf(array, array.length << 1);
638                                     this.array = array;
639                                 }
640                                 array[index] = element;
641                                 table[slot] = (long) index << 32 | hash & 0xFFFFFFFFL;
642                                 ++this.size;
643                                 return index;
644                             }
645                         }
646                     }
647                     return encode(element, resultIfFull); // retry on rehash and concurrent change
648                 }
649                 if ((int) cell == hash) {
650                     final int index = (int) (cell >>> 32);
651                     if (index >= array.length) {
652                         return encode(element, resultIfFull); // array not aligned to table: retry
653                     }
654                     if (element.equals(array[index])) {
655                         return index;
656                     }
657                 }
658             }
659         }
660 
661         int encode(final T element) {
662             final int index = encode(element, -1);
663             if (index < 0) {
664                 throw new IllegalStateException("Full index");
665             }
666             return index;
667         }
668 
669         @SuppressWarnings("unchecked")
670         T decode(final int index) {
671             return (T) this.array[index];
672         }
673 
674         private void rehash() {
675             final long[] oldTable = this.table;
676             final long[] newTable = new long[oldTable.length << 1];
677             final int newMask = newTable.length - 1;
678             for (int oldSlot = 0; oldSlot < oldTable.length; ++oldSlot) {
679                 final long cell = oldTable[oldSlot];
680                 if (cell != 0L) {
681                     final int hash = (int) cell;
682                     int newSlot = hash & newMask;
683                     while (newTable[newSlot] != 0L) {
684                         newSlot = newSlot + 1 & newMask;
685                     }
686                     newTable[newSlot] = cell;
687                 }
688             }
689             this.table = newTable;
690         }
691 
692     }
693 
694     // TODO
695     // - optimize memory layout
696     // - encode bigrams in strings
697     // - rehashing simultaneous to encoding
698 
699     private static final class MemoryDictionary extends Dictionary {
700 
701         private final Buffer primaryBuffer;
702 
703         private final Buffer secondaryBuffer;
704 
705         private long primaryOffset;
706 
707         private long secondaryOffset;
708 
709         private long[] table;
710 
711         private int size;
712 
713         private final Object[] locks;
714 
715         MemoryDictionary() {
716             this.primaryBuffer = Buffer.newResizableBuffer();
717             this.secondaryBuffer = Buffer.newResizableBuffer();
718             this.primaryOffset = 0L;
719             this.secondaryOffset = 0L;
720             this.table = new long[512]; // 4K
721             this.size = 0;
722             this.locks = new Object[Environment.getCores() * 32];
723             for (int i = 0; i < this.locks.length; ++i) {
724                 this.locks[i] = new Object();
725             }
726 
727             // Write dummy byte just to avoid starting at offset 0
728             this.primaryBuffer.write(0, (byte) 0);
729         }
730 
731         int doEncode(final int type, final int index, final String string) {
732 
733             final int hash = type * 6661 + index * 661 + string.hashCode() & 0x7FFFFFFF;
734 
735             final long[] table = this.table; // may concurrently change
736             final int mask = table.length - 1;
737 
738             for (int slot = hash & mask;; slot = slot + 1 & mask) {
739                 final long cell = table[slot];
740                 if (cell == 0L) {
741                     final Buffer buffer = Buffer.newFixedBuffer(new byte[string.length() * 3 + 6]);
742                     final int bufferLength = buffer.writeString(4, string);
743                     synchronized (this) {
744                         if (table == this.table && table[slot] == 0L) {
745                             if (this.size > this.table.length / 3 * 2) {
746                                 rehash(); // enforce load factor < .66
747                             } else {
748                                 final long offset = append(type, index, buffer, 4, bufferLength);
749                                 final int code = offsetToCode(offset);
750                                 table[slot] = (long) code << 32 | hash & 0xFFFFFFFFL;
751                                 ++this.size;
752                                 return code;
753                             }
754                         }
755                     }
756                     return doEncode(type, index, string); // retry on rehash and concurrent change
757                 }
758                 if ((int) cell == hash) {
759                     final long offset = codeToOffset((int) (cell >>> 32));
760                     if (equals(offset, type, index, string)) {
761                         return (int) (offset >>> 2);
762                     }
763                 }
764             }
765         }
766 
767         int doEncode2(final int type, final int index, final String string) {
768 
769             final int hash = type * 6661 + index * 661 + string.hashCode() & 0x7FFFFFFF;
770 
771             long[] table = this.table; // may concurrently change
772             int mask = table.length - 1;
773 
774             for (int slot = hash & mask;; slot = slot + 1 & mask) {
775                 final long cell = table[slot];
776                 if (cell == 0L) {
777                     final Buffer buffer = Buffer.newFixedBuffer(new byte[string.length() * 3]);
778                     final int bufferLength = buffer.writeString(0, string);
779                     boolean proceed = false;
780                     synchronized (this.locks[hash & this.locks.length - 1]) {
781                         synchronized (this) {
782                             if (table == this.table && table[slot] == 0L) {
783                                 if (this.size > this.table.length / 3 * 2) {
784                                     rehash(); // enforce load factor < .66
785                                 } else {
786                                     proceed = true;
787                                 }
788                             }
789                         }
790                         if (proceed) {
791                             final long offset = append2(type, index, buffer, bufferLength);
792                             final int code = offsetToCode(offset);
793                             synchronized (this) {
794                                 if (table != this.table || table[slot] != 0L) {
795                                     table = this.table;
796                                     mask = table.length - 1;
797                                     slot = hash & mask;
798                                     while (table[slot] != 0) {
799                                         slot = slot + 1 & mask;
800                                     }
801                                 }
802                                 table[slot] = (long) code << 32 | hash & 0xFFFFFFFFL;
803                                 ++this.size;
804                             }
805                             return code;
806                         }
807                     }
808                     return doEncode(type, index, string); // retry on rehash and concurrent change
809                 }
810                 if ((int) cell == hash) {
811                     final long offset = codeToOffset((int) (cell >>> 32));
812                     if (equals(offset, type, index, string)) {
813                         return (int) (offset >>> 2);
814                     }
815                 }
816             }
817         }
818 
819         @Override
820         Value doDecode(final int code) {
821 
822             // Extract the offset from the code
823             long offset = codeToOffset(code);
824 
825             // Extract the header stored at the offset
826             final byte header = this.primaryBuffer.read(offset);
827             assert (header & 0x7) == 0;
828 
829             // Extract value type
830             final int type = header >>> 5 & 0x07;
831 
832             String string;
833             int len;
834             final boolean local = (header & 0x10) != 0;
835             if (local) {
836                 final boolean fw = (header & 0x08) != 0;
837                 len = this.primaryBuffer.read(offset + (fw ? 1 : -1)) & 0xFF;
838                 offset += fw ? 2 : 1;
839                 string = this.primaryBuffer.readString(offset, len);
840                 offset += len;
841             } else {
842                 final long off = this.primaryBuffer.readNumber(offset + 1, 5);
843                 len = this.secondaryBuffer.readInt(off);
844                 string = this.secondaryBuffer.readString(off + 4, len);
845                 offset += 6;
846             }
847 
848             int index = 0;
849             if (type == TYPE_URI_NAME || type == TYPE_LITERAL_LANG
850                     || type == Dictionary.TYPE_LITERAL_DT) {
851                 index = this.primaryBuffer.readShort(offset) & 0xFFFF;
852             }
853 
854             return valueFor(type, index, string);
855         }
856 
857         @Override
858         int doType(final int code) {
859             final long offset = (long) code << 2;
860             return this.primaryBuffer.read(offset) >>> 5 & 0x7;
861         }
862 
863         @Override
864         void doToString(final StringBuilder builder) {
865             final long primary = this.primaryOffset;
866             final long secondary = this.secondaryOffset;
867             final long hash = this.table.length * 8;
868             final long total = primary + secondary + hash;
869             builder.append(this.size).append(" values, ").append(total).append(" bytes (")
870                     .append(hash).append(" hash table, ").append(primary)
871                     .append(" primary buffer, ").append(secondary).append(" secondary buffer)");
872         }
873 
874         private void rehash() {
875             // TODO rehashing big table may takes seconds (e.g. 7s for 64M -> 128M entries) and
876             // parallelization provides no benefits. The only solution would be to do rehashing
877             // one block at a time and in the meanwhile continue serving encode() requests hitting
878             // already rehashed blocks.
879             final long ts = System.currentTimeMillis();
880             final long[] oldTable = this.table;
881             final long[] newTable = new long[oldTable.length << 1];
882             final int newMask = newTable.length - 1;
883             for (int oldSlot = 0; oldSlot < oldTable.length; ++oldSlot) {
884                 final long cell = oldTable[oldSlot];
885                 if (cell != 0L) {
886                     final int hash = (int) (cell & 0xFFFFFFFF);
887                     int newSlot = hash & newMask;
888                     while (newTable[newSlot] != 0L) {
889                         newSlot = newSlot + 1 & newMask;
890                     }
891                     newTable[newSlot] = cell;
892                 }
893             }
894             this.table = newTable;
895             LOGGER.debug("Rehashed from {} to {} entries in {} ms", oldTable.length,
896                     newTable.length, System.currentTimeMillis() - ts);
897         }
898 
899         private long append(final int type, final int index, final Buffer buffer, int bufferStart,
900                 int bufferLength) {
901 
902             final long offset = padOffset(this.primaryOffset);
903             final boolean local = bufferLength < 128 && offset <= 1L << 33;
904             final int header = type << 5 | (local ? 0x10 : 0);
905 
906             if (local) {
907                 final boolean fw = offset - this.primaryOffset == 0;
908                 buffer.write(bufferStart - (fw ? 2 : 1), (byte) (header | (fw ? 0x08 : 0)));
909                 buffer.write(bufferStart - (fw ? 1 : 2), (byte) bufferLength);
910                 int bufferEnd = bufferStart + bufferLength;
911                 if (index > 0) {
912                     buffer.writeShort(bufferStart + bufferLength, (short) index);
913                     bufferEnd += 2;
914                 }
915                 bufferStart -= 2;
916                 final int writeLength = bufferEnd - bufferStart;
917                 final long writeOffset = offset + (fw ? 0 : -1);
918                 this.primaryBuffer.writeBuffer(writeOffset, buffer, bufferStart, writeLength);
919                 this.primaryOffset = writeOffset + writeLength;
920 
921             } else {
922                 final long pointer = this.secondaryOffset;
923                 buffer.writeInt(bufferStart - 4, bufferLength);
924                 bufferStart -= 4;
925                 bufferLength += 4;
926                 this.secondaryBuffer.writeBuffer(pointer, buffer, bufferStart, bufferLength);
927                 this.secondaryOffset += bufferLength;
928                 if (index > 0) {
929                     final long n = (long) header << 56 | pointer << 16 | index & 0xFFFFL;
930                     this.primaryBuffer.writeLong(offset, n);
931                     this.primaryOffset = offset + 8;
932                 } else {
933                     final long n = (long) header << 40 | pointer;
934                     this.primaryBuffer.writeNumber(offset, 6, n);
935                     this.primaryOffset = offset + 6;
936                 }
937             }
938 
939             return offset;
940         }
941 
942         private long append2(final int type, final int index, final Buffer buffer,
943                 final int bufferLength) {
944 
945             final long offset;
946             final long lastOffset;
947             final long secondaryOffset;
948             final long indexOffset;
949             boolean local;
950 
951             synchronized (this.primaryBuffer) {
952                 lastOffset = this.primaryOffset;
953                 offset = padOffset(lastOffset);
954                 secondaryOffset = this.secondaryOffset;
955                 local = bufferLength <= 128 && offset <= 1L << 33;
956                 if (local) {
957                     final boolean fw = offset - lastOffset == 0;
958                     indexOffset = offset + (fw ? 2 : 1) + bufferLength;
959                 } else {
960                     indexOffset = offset + 6;
961                     this.secondaryOffset += 4 + bufferLength;
962                 }
963                 this.primaryOffset = indexOffset + (index > 0 ? 2 : 0);
964             }
965 
966             int header = type << 5 | (local ? 0x10 : 0);
967 
968             if (local) {
969                 final boolean fw = offset - lastOffset == 0;
970                 final long lenOffset = offset + (fw ? +1 : -1);
971                 final long stringOffset = offset + (fw ? 2 : 1);
972                 this.primaryBuffer.write(lenOffset, (byte) bufferLength);
973                 this.primaryBuffer.writeBuffer(stringOffset, buffer, 0L, bufferLength);
974                 header |= fw ? 0x08 : 0;
975             } else {
976                 this.secondaryBuffer.writeInt(secondaryOffset, bufferLength);
977                 this.secondaryBuffer.writeBuffer(secondaryOffset + 4, buffer, 0L, bufferLength);
978                 this.secondaryOffset = secondaryOffset + 4 + bufferLength;
979                 this.primaryBuffer.writeNumber(offset + 1, 5, secondaryOffset);
980             }
981 
982             this.primaryBuffer.write(offset, (byte) header);
983 
984             if (index > 0) {
985                 this.primaryBuffer.writeShort(indexOffset, (short) index);
986             }
987 
988             return offset;
989         }
990 
991         private boolean equals(long offset, final int type, final int index, final String string) {
992 
993             // Extract the header
994             final byte header = this.primaryBuffer.read(offset);
995             assert (header & 0x7) == 0;
996 
997             // Check stored type matches supplied type
998             if ((header >>> 5 & 0x07) != type) {
999                 return false;
1000             }
1001 
1002             // Extract buffer, offset and length of stored string (do not read it yet)
1003             // also move offset at the end of the string
1004             int strLen;
1005             long strOffset;
1006             Buffer strBuffer;
1007             final boolean local = (header & 0x10) != 0;
1008             if (local) {
1009                 final boolean fw = (header & 0x08) != 0;
1010                 strLen = this.primaryBuffer.read(offset + (fw ? 1 : -1)) & 0xFF;
1011                 strOffset = offset + (fw ? 2 : 1);
1012                 strBuffer = this.primaryBuffer;
1013                 offset = strOffset + strLen;
1014             } else {
1015                 final long off = this.primaryBuffer.readNumber(offset + 1, 5);
1016                 strLen = this.secondaryBuffer.readInt(off);
1017                 strOffset = off + 4;
1018                 strBuffer = this.secondaryBuffer;
1019                 offset += 6;
1020             }
1021             assert strLen >= 0;
1022             assert strOffset >= 0;
1023 
1024             // Check the index, if applicable
1025             if (type == TYPE_URI_NAME || type == TYPE_LITERAL_LANG
1026                     || type == Dictionary.TYPE_LITERAL_DT) {
1027                 final int storedIndex = this.primaryBuffer.readShort(offset) & 0xFFFF;
1028                 if (index != storedIndex) {
1029                     return false;
1030                 }
1031             }
1032 
1033             // Check the string
1034             return strBuffer.equalString(strOffset, strLen, string);
1035         }
1036 
1037         private static long codeToOffset(final int code) {
1038             // return (long) (code & 0xFFFFFFF) << 2;
1039             return (code & 0x80000000) == 0 ? (long) code << 2 : ((code & 0xFFFFFFFFL) << 3)
1040                     - (1L << 33);
1041         }
1042 
1043         private static int offsetToCode(final long offset) {
1044             // return (int) (offset >>> 2);
1045             return (int) (offset < 1L << 33 ? offset >>> 2 : (1L << 30) + offset >>> 3);
1046         }
1047 
1048         private static long padOffset(final long offset) {
1049             return offset < 1L << 33 ? offset + 3 & 0xFFFFFFFFFFFFFFFCL
1050                     : offset + 7 & 0xFFFFFFFFFFFFFFF8L;
1051         }
1052 
1053     }
1054 
1055 }