1   /*
2    * RDFpro - An extensible tool for building stream-oriented RDF processing libraries.
3    *
4    * Written in 2015 by Francesco Corcoglioniti with support by Alessio Palmero Aprosio and Marco
5    * Rospocher. Contact info on http://rdfpro.fbk.eu/
6    *
7    * To the extent possible under law, the authors have dedicated all copyright and related and
8    * neighboring rights to this software to the public domain worldwide. This software is
9    * distributed without any warranty.
10   *
11   * You should have received a copy of the CC0 Public Domain Dedication along with this software.
12   * If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
13   */
14  package eu.fbk.rdfpro.util;
15  
16  import java.io.IOException;
17  import java.util.ArrayList;
18  import java.util.List;
19  
20  // TODO
21  // (1) move hash values out of hash table or drop them, revising rehashing code
22  
23  final class StringIndex {
24  
25      private static final int SMALL_BUFFER_SIZE = 64 * 1024;
26  
27      private static final int LARGE_BUFFER_SIZE = SMALL_BUFFER_SIZE * 8;
28  
29      private final List<byte[]> smallBuffers;
30  
31      private final List<byte[]> largeBuffers;
32  
33      private int smallNextID;
34  
35      private int largeNextID;
36  
37      private int[] table;
38  
39      private int size;
40  
41      public StringIndex() {
42  
43          this.smallBuffers = new ArrayList<>();
44          this.largeBuffers = new ArrayList<>();
45          this.smallNextID = getID(true, 0, 0);
46          this.largeNextID = getID(false, 0, 0);
47          this.table = new int[1022]; // 511 entries, ~4K memory page
48          this.size = 0;
49  
50          this.smallBuffers.add(new byte[SMALL_BUFFER_SIZE]);
51          this.largeBuffers.add(new byte[LARGE_BUFFER_SIZE]);
52      }
53  
54      public int size() {
55          return this.size;
56      }
57  
58      public boolean contains(final String string) {
59          return lookup(string, false) != 0;
60      }
61  
62      public int put(final String string) {
63          return lookup(string, true);
64      }
65  
66      public String get(final int id) {
67          return get(id, new StringBuilder()).toString();
68      }
69  
70      public StringBuilder get(final int id, final StringBuilder out) {
71          try {
72              get(id, (Appendable) out);
73              return out;
74          } catch (final IOException ex) {
75              throw new Error(ex);
76          }
77      }
78  
79      public <T extends Appendable> T get(final int id, final T out) throws IOException {
80  
81          final boolean small = isSmall(id);
82          final int page = getPage(id);
83          int offset = getOffset(id);
84  
85          final byte[] buffer;
86          int length;
87          if (small) {
88              buffer = this.smallBuffers.get(page);
89              length = buffer[offset] & 0xFF;
90              offset++;
91          } else {
92              buffer = this.largeBuffers.get(page);
93              length = getInt(buffer, offset);
94              offset += 4;
95          }
96  
97          for (int i = 0; i < length; ++i) {
98              final byte b = buffer[offset++];
99              if (b != 0) {
100                 out.append((char) b);
101             } else {
102                 out.append(getChar(buffer, offset));
103                 offset += 2;
104             }
105         }
106 
107         return out;
108     }
109 
110     public int length(final int id) {
111         final boolean small = isSmall(id);
112         final int page = getPage(id);
113         final int offset = getOffset(id);
114         return small ? this.smallBuffers.get(page)[offset] & 0xFF //
115                 : getInt(this.largeBuffers.get(page), offset);
116     }
117 
118     public boolean equals(final int id, final String string) {
119         return equals(id, string, 0, string.length());
120     }
121 
122     public boolean equals(final int id, final String string, final int startIndex,
123             final int endIndex) {
124 
125         final boolean idSmall = isSmall(id);
126         final boolean stringSmall = isSmall(string);
127         if (idSmall != stringSmall) {
128             return false;
129         }
130 
131         final int page = getPage(id);
132         int offset = getOffset(id);
133 
134         final byte[] buffer;
135         int length;
136         if (idSmall) {
137             buffer = this.smallBuffers.get(page);
138             length = buffer[offset] & 0xFF;
139             offset++;
140         } else {
141             buffer = this.largeBuffers.get(page);
142             length = getInt(buffer, offset);
143             offset += 4;
144         }
145 
146         if (length != endIndex - startIndex) {
147             return false;
148         }
149 
150         for (int i = startIndex; i < endIndex; ++i) {
151             char c;
152             final byte b = buffer[offset++];
153             if (b != 0) {
154                 c = (char) b;
155             } else {
156                 c = getChar(buffer, offset);
157                 offset += 2;
158             }
159             if (c != string.charAt(i)) {
160                 return false;
161             }
162         }
163         return true;
164     }
165 
166     private int lookup(final String string, final boolean canAppend) {
167         final int hash = string.hashCode();
168         int slot = (hash & 0x7FFFFFFF) % (this.table.length / 2) * 2;
169         while (true) {
170             int id = this.table[slot];
171             if (id == 0) {
172                 if (!canAppend) {
173                     return 0;
174                 } else if (this.size > this.table.length * 2 / 5) { // enforce load factor < .8
175                     rehash();
176                     return lookup(string, canAppend); // repeat after rehashing
177                 }
178                 id = append(string);
179                 this.table[slot] = id;
180                 this.table[slot + 1] = hash;
181                 ++this.size;
182                 return id;
183             }
184             if (this.table[slot + 1] == hash && equals(id, string)) {
185                 return id;
186             }
187             slot += 2;
188             if (slot >= this.table.length) {
189                 slot = 0;
190             }
191         }
192     }
193 
194     private void rehash() {
195         final int newSize = this.table.length + 1; // new number of elements
196         final int[] newTable = new int[2 * newSize];
197         for (int oldOffset = 0; oldOffset < this.table.length; oldOffset += 2) {
198             final int id = this.table[oldOffset];
199             if (id != 0) {
200                 final int hash = this.table[oldOffset + 1];
201                 int newOffset = (hash & 0x7FFFFFFF) % newSize * 2;
202                 while (newTable[newOffset] != 0) {
203                     newOffset += 2;
204                     if (newOffset >= newTable.length) {
205                         newOffset = 0;
206                     }
207                 }
208                 newTable[newOffset] = id;
209                 newTable[newOffset + 1] = hash;
210             }
211         }
212         this.table = newTable;
213     }
214 
215     private int append(final String string) {
216 
217         final boolean small = isSmall(string);
218         final int length = string.length();
219 
220         byte[] buffer;
221         int offset;
222         final int id;
223         if (small) {
224             offset = getOffset(this.smallNextID);
225             if (offset + 1 + length * 3 >= SMALL_BUFFER_SIZE) {
226                 this.smallBuffers.add(new byte[SMALL_BUFFER_SIZE]);
227                 this.smallNextID = getID(true, this.smallBuffers.size() - 1, 0);
228                 offset = 0;
229             }
230             id = this.smallNextID;
231             buffer = this.smallBuffers.get(this.smallBuffers.size() - 1);
232             buffer[offset++] = (byte) length;
233         } else {
234             offset = getOffset(this.largeNextID);
235             if (offset + 4 + length * 3 >= LARGE_BUFFER_SIZE) {
236                 this.largeBuffers.add(new byte[LARGE_BUFFER_SIZE]);
237                 this.largeNextID = getID(false, this.largeBuffers.size() - 1, 0);
238                 offset = 0;
239             }
240             id = this.largeNextID;
241             buffer = this.largeBuffers.get(this.largeBuffers.size() - 1);
242             putInt(buffer, offset, length);
243             offset += 4;
244         }
245 
246         for (int i = 0; i < length; ++i) {
247             final char ch = string.charAt(i);
248             if (ch > 0 && ch <= 127) {
249                 buffer[offset++] = (byte) ch;
250             } else {
251                 buffer[offset++] = (byte) 0;
252                 putChar(buffer, offset, ch);
253                 offset += 2;
254             }
255         }
256 
257         if (small) {
258             this.smallNextID = getID(small, this.smallBuffers.size() - 1, offset);
259         } else {
260             this.largeNextID = getID(small, this.largeBuffers.size() - 1, offset);
261         }
262 
263         return id;
264     }
265 
266     private int getID(final boolean small, final int page, final int offset) {
267         return small ? (page + 1 & 0x7FFF) << 16 | offset & 0xFFFF
268                 : 0x80000000 | (page + 1 & 0x7FFF) << 16 | offset + 7 >> 3 & 0xFFFF;
269     }
270 
271     private int getPage(final int id) {
272         return (id >> 16 & 0x7FFF) - 1;
273     }
274 
275     private int getOffset(final int id) {
276         return (id & 0x80000000) == 0 ? id & 0xFFFF : (id & 0xFFFF) << 3;
277     }
278 
279     private boolean isSmall(final int id) {
280         return (id & 0x80000000) == 0;
281     }
282 
283     private boolean isSmall(final String string) {
284         return string.length() <= 255;
285     }
286 
287     private static char getChar(final byte[] buffer, final int offset) {
288         final int byte0 = buffer[offset] & 0xFF;
289         final int byte1 = buffer[offset + 1] & 0xFF;
290         return (char) (byte0 << 8 | byte1);
291     }
292 
293     private static void putChar(final byte[] buffer, final int offset, final char value) {
294         buffer[offset] = (byte) (value >>> 8);
295         buffer[offset + 1] = (byte) value;
296     }
297 
298     private static int getInt(final byte[] buffer, final int offset) {
299         final int byte0 = buffer[offset] & 0xFF;
300         final int byte1 = buffer[offset + 1] & 0xFF;
301         final int byte2 = buffer[offset + 2] & 0xFF;
302         final int byte3 = buffer[offset + 3] & 0xFF;
303         return byte0 << 24 | byte1 << 16 | byte2 << 8 | byte3;
304     }
305 
306     private static void putInt(final byte[] buffer, final int offset, final int value) {
307         buffer[offset] = (byte) (value >>> 24);
308         buffer[offset + 1] = (byte) (value >>> 16);
309         buffer[offset + 2] = (byte) (value >>> 8);
310         buffer[offset + 3] = (byte) value;
311     }
312 
313 }