1
2
3
4
5
6
7
8
9
10
11
12
13
14 package eu.fbk.rdfpro.util;
15
16 import java.io.IOException;
17 import java.util.ArrayList;
18 import java.util.List;
19
20
21
22
23 final class StringIndex {
24
25 private static final int SMALL_BUFFER_SIZE = 64 * 1024;
26
27 private static final int LARGE_BUFFER_SIZE = SMALL_BUFFER_SIZE * 8;
28
29 private final List<byte[]> smallBuffers;
30
31 private final List<byte[]> largeBuffers;
32
33 private int smallNextID;
34
35 private int largeNextID;
36
37 private int[] table;
38
39 private int size;
40
41 public StringIndex() {
42
43 this.smallBuffers = new ArrayList<>();
44 this.largeBuffers = new ArrayList<>();
45 this.smallNextID = getID(true, 0, 0);
46 this.largeNextID = getID(false, 0, 0);
47 this.table = new int[1022];
48 this.size = 0;
49
50 this.smallBuffers.add(new byte[SMALL_BUFFER_SIZE]);
51 this.largeBuffers.add(new byte[LARGE_BUFFER_SIZE]);
52 }
53
54 public int size() {
55 return this.size;
56 }
57
58 public boolean contains(final String string) {
59 return lookup(string, false) != 0;
60 }
61
62 public int put(final String string) {
63 return lookup(string, true);
64 }
65
66 public String get(final int id) {
67 return get(id, new StringBuilder()).toString();
68 }
69
70 public StringBuilder get(final int id, final StringBuilder out) {
71 try {
72 get(id, (Appendable) out);
73 return out;
74 } catch (final IOException ex) {
75 throw new Error(ex);
76 }
77 }
78
79 public <T extends Appendable> T get(final int id, final T out) throws IOException {
80
81 final boolean small = isSmall(id);
82 final int page = getPage(id);
83 int offset = getOffset(id);
84
85 final byte[] buffer;
86 int length;
87 if (small) {
88 buffer = this.smallBuffers.get(page);
89 length = buffer[offset] & 0xFF;
90 offset++;
91 } else {
92 buffer = this.largeBuffers.get(page);
93 length = getInt(buffer, offset);
94 offset += 4;
95 }
96
97 for (int i = 0; i < length; ++i) {
98 final byte b = buffer[offset++];
99 if (b != 0) {
100 out.append((char) b);
101 } else {
102 out.append(getChar(buffer, offset));
103 offset += 2;
104 }
105 }
106
107 return out;
108 }
109
110 public int length(final int id) {
111 final boolean small = isSmall(id);
112 final int page = getPage(id);
113 final int offset = getOffset(id);
114 return small ? this.smallBuffers.get(page)[offset] & 0xFF
115 : getInt(this.largeBuffers.get(page), offset);
116 }
117
118 public boolean equals(final int id, final String string) {
119 return equals(id, string, 0, string.length());
120 }
121
122 public boolean equals(final int id, final String string, final int startIndex,
123 final int endIndex) {
124
125 final boolean idSmall = isSmall(id);
126 final boolean stringSmall = isSmall(string);
127 if (idSmall != stringSmall) {
128 return false;
129 }
130
131 final int page = getPage(id);
132 int offset = getOffset(id);
133
134 final byte[] buffer;
135 int length;
136 if (idSmall) {
137 buffer = this.smallBuffers.get(page);
138 length = buffer[offset] & 0xFF;
139 offset++;
140 } else {
141 buffer = this.largeBuffers.get(page);
142 length = getInt(buffer, offset);
143 offset += 4;
144 }
145
146 if (length != endIndex - startIndex) {
147 return false;
148 }
149
150 for (int i = startIndex; i < endIndex; ++i) {
151 char c;
152 final byte b = buffer[offset++];
153 if (b != 0) {
154 c = (char) b;
155 } else {
156 c = getChar(buffer, offset);
157 offset += 2;
158 }
159 if (c != string.charAt(i)) {
160 return false;
161 }
162 }
163 return true;
164 }
165
166 private int lookup(final String string, final boolean canAppend) {
167 final int hash = string.hashCode();
168 int slot = (hash & 0x7FFFFFFF) % (this.table.length / 2) * 2;
169 while (true) {
170 int id = this.table[slot];
171 if (id == 0) {
172 if (!canAppend) {
173 return 0;
174 } else if (this.size > this.table.length * 2 / 5) {
175 rehash();
176 return lookup(string, canAppend);
177 }
178 id = append(string);
179 this.table[slot] = id;
180 this.table[slot + 1] = hash;
181 ++this.size;
182 return id;
183 }
184 if (this.table[slot + 1] == hash && equals(id, string)) {
185 return id;
186 }
187 slot += 2;
188 if (slot >= this.table.length) {
189 slot = 0;
190 }
191 }
192 }
193
194 private void rehash() {
195 final int newSize = this.table.length + 1;
196 final int[] newTable = new int[2 * newSize];
197 for (int oldOffset = 0; oldOffset < this.table.length; oldOffset += 2) {
198 final int id = this.table[oldOffset];
199 if (id != 0) {
200 final int hash = this.table[oldOffset + 1];
201 int newOffset = (hash & 0x7FFFFFFF) % newSize * 2;
202 while (newTable[newOffset] != 0) {
203 newOffset += 2;
204 if (newOffset >= newTable.length) {
205 newOffset = 0;
206 }
207 }
208 newTable[newOffset] = id;
209 newTable[newOffset + 1] = hash;
210 }
211 }
212 this.table = newTable;
213 }
214
215 private int append(final String string) {
216
217 final boolean small = isSmall(string);
218 final int length = string.length();
219
220 byte[] buffer;
221 int offset;
222 final int id;
223 if (small) {
224 offset = getOffset(this.smallNextID);
225 if (offset + 1 + length * 3 >= SMALL_BUFFER_SIZE) {
226 this.smallBuffers.add(new byte[SMALL_BUFFER_SIZE]);
227 this.smallNextID = getID(true, this.smallBuffers.size() - 1, 0);
228 offset = 0;
229 }
230 id = this.smallNextID;
231 buffer = this.smallBuffers.get(this.smallBuffers.size() - 1);
232 buffer[offset++] = (byte) length;
233 } else {
234 offset = getOffset(this.largeNextID);
235 if (offset + 4 + length * 3 >= LARGE_BUFFER_SIZE) {
236 this.largeBuffers.add(new byte[LARGE_BUFFER_SIZE]);
237 this.largeNextID = getID(false, this.largeBuffers.size() - 1, 0);
238 offset = 0;
239 }
240 id = this.largeNextID;
241 buffer = this.largeBuffers.get(this.largeBuffers.size() - 1);
242 putInt(buffer, offset, length);
243 offset += 4;
244 }
245
246 for (int i = 0; i < length; ++i) {
247 final char ch = string.charAt(i);
248 if (ch > 0 && ch <= 127) {
249 buffer[offset++] = (byte) ch;
250 } else {
251 buffer[offset++] = (byte) 0;
252 putChar(buffer, offset, ch);
253 offset += 2;
254 }
255 }
256
257 if (small) {
258 this.smallNextID = getID(small, this.smallBuffers.size() - 1, offset);
259 } else {
260 this.largeNextID = getID(small, this.largeBuffers.size() - 1, offset);
261 }
262
263 return id;
264 }
265
266 private int getID(final boolean small, final int page, final int offset) {
267 return small ? (page + 1 & 0x7FFF) << 16 | offset & 0xFFFF
268 : 0x80000000 | (page + 1 & 0x7FFF) << 16 | offset + 7 >> 3 & 0xFFFF;
269 }
270
271 private int getPage(final int id) {
272 return (id >> 16 & 0x7FFF) - 1;
273 }
274
275 private int getOffset(final int id) {
276 return (id & 0x80000000) == 0 ? id & 0xFFFF : (id & 0xFFFF) << 3;
277 }
278
279 private boolean isSmall(final int id) {
280 return (id & 0x80000000) == 0;
281 }
282
283 private boolean isSmall(final String string) {
284 return string.length() <= 255;
285 }
286
287 private static char getChar(final byte[] buffer, final int offset) {
288 final int byte0 = buffer[offset] & 0xFF;
289 final int byte1 = buffer[offset + 1] & 0xFF;
290 return (char) (byte0 << 8 | byte1);
291 }
292
293 private static void putChar(final byte[] buffer, final int offset, final char value) {
294 buffer[offset] = (byte) (value >>> 8);
295 buffer[offset + 1] = (byte) value;
296 }
297
298 private static int getInt(final byte[] buffer, final int offset) {
299 final int byte0 = buffer[offset] & 0xFF;
300 final int byte1 = buffer[offset + 1] & 0xFF;
301 final int byte2 = buffer[offset + 2] & 0xFF;
302 final int byte3 = buffer[offset + 3] & 0xFF;
303 return byte0 << 24 | byte1 << 16 | byte2 << 8 | byte3;
304 }
305
306 private static void putInt(final byte[] buffer, final int offset, final int value) {
307 buffer[offset] = (byte) (value >>> 24);
308 buffer[offset + 1] = (byte) (value >>> 16);
309 buffer[offset + 2] = (byte) (value >>> 8);
310 buffer[offset + 3] = (byte) value;
311 }
312
313 }