1   /*
2    * RDFpro - An extensible tool for building stream-oriented RDF processing libraries.
3    * 
4    * Written in 2014 by Francesco Corcoglioniti with support by Marco Amadori, Michele Mostarda,
5    * Alessio Palmero Aprosio and Marco Rospocher. Contact info on http://rdfpro.fbk.eu/
6    * 
7    * To the extent possible under law, the authors have dedicated all copyright and related and
8    * neighboring rights to this software to the public domain worldwide. This software is
9    * distributed without any warranty.
10   * 
11   * You should have received a copy of the CC0 Public Domain Dedication along with this software.
12   * If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
13   */
14  package eu.fbk.rdfpro.tql;
15  
16  import java.io.IOException;
17  import java.io.InputStream;
18  import java.io.InputStreamReader;
19  import java.io.Reader;
20  import java.nio.charset.Charset;
21  
22  import org.openrdf.model.Resource;
23  import org.openrdf.model.Statement;
24  import org.openrdf.model.URI;
25  import org.openrdf.model.Value;
26  import org.openrdf.model.ValueFactory;
27  import org.openrdf.model.impl.ValueFactoryImpl;
28  import org.openrdf.model.vocabulary.SESAME;
29  import org.openrdf.rio.RDFFormat;
30  import org.openrdf.rio.RDFHandlerException;
31  import org.openrdf.rio.RDFParseException;
32  import org.openrdf.rio.helpers.NTriplesParserSettings;
33  import org.openrdf.rio.helpers.RDFParserBase;
34  
35  /**
36   * A parser that can parse RDF documents that are in the Turtle Quads (TQL) format. TQL is N-Quads
37   * with the more permissive (and efficient!) Turtle encoding. TQL is used in DBpedia exports and
38   * is supported in input by the Virtuoso triple store.
39   */
40  public class TQLParser extends RDFParserBase {
41  
42      private static final int EOF = -1;
43  
44      private Reader reader;
45  
46      private int lineNo;
47  
48      private StringBuilder builder;
49  
50      private Value value;
51  
52      /**
53       * Creates a new TQLParser that will use a {@link ValueFactoryImpl} to create RDF model
54       * objects.
55       */
56      public TQLParser() {
57          super();
58      }
59  
60      /**
61       * Creates a new TQLParser that will use the supplied ValueFactory to create RDF model
62       * objects.
63       *
64       * @param valueFactory
65       *            the ValueFactory to use
66       */
67      public TQLParser(final ValueFactory valueFactory) {
68          super(valueFactory);
69      }
70  
71      @Override
72      public RDFFormat getRDFFormat() {
73          return TQL.FORMAT;
74      }
75  
76      @Override
77      public void parse(final InputStream stream, final String baseURI) throws IOException,
78              RDFParseException, RDFHandlerException {
79          parse(new InputStreamReader(stream, Charset.forName("UTF-8")), baseURI);
80      }
81  
82      @Override
83      public void parse(final Reader reader, final String baseURI) throws IOException,
84              RDFParseException, RDFHandlerException {
85  
86          if (reader == null) {
87              throw new NullPointerException("Null reader");
88          }
89  
90          if (this.rdfHandler != null) {
91              this.rdfHandler.startRDF();
92          }
93  
94          this.reader = reader;
95          this.lineNo = 1;
96          this.builder = new StringBuilder(1024);
97          this.value = null;
98  
99          reportLocation(this.lineNo, 1);
100 
101         try {
102             int c = read();
103             c = skipWhitespace(c);
104             while (c != EOF) {
105                 if (c == '#') {
106                     c = skipLine(c);
107                 } else if (c == '\r' || c == '\n') {
108                     c = skipLine(c);
109                 } else {
110                     c = parseQuad(c);
111                 }
112                 c = skipWhitespace(c);
113             }
114         } finally {
115             clear();
116             this.reader = null;
117             this.builder = null;
118             this.value = null;
119         }
120 
121         if (this.rdfHandler != null) {
122             this.rdfHandler.endRDF();
123         }
124     }
125 
126     private int skipLine(final int ch) throws IOException {
127         int c = ch;
128         while (c != EOF && c != '\r' && c != '\n') {
129             c = read();
130         }
131         if (c == '\n') {
132             c = read();
133             this.lineNo++;
134             reportLocation(this.lineNo, 1);
135         } else if (c == '\r') {
136             c = read();
137             if (c == '\n') {
138                 c = read();
139             }
140             this.lineNo++;
141             reportLocation(this.lineNo, 1);
142         }
143         return c;
144     }
145 
146     private int skipWhitespace(final int ch) throws IOException {
147         int c = ch;
148         while (c == ' ' || c == '\t') {
149             c = read();
150         }
151         return c;
152     }
153 
154     private int parseQuad(final int ch) throws IOException, RDFParseException, RDFHandlerException {
155 
156         int c = ch;
157         try {
158             c = parseResource(c);
159             boolean periodConsumed = (c & 0x80000000) != 0;
160             final Resource subject = (Resource) this.value;
161             if (periodConsumed) {
162                 throwParseException("Found unexpected '.' " + (char) c);
163             }
164 
165             c = skipWhitespace(c);
166             c = parseURI(c);
167             periodConsumed = (c & 0x80000000) != 0;
168             final URI predicate = (URI) this.value;
169             if (periodConsumed) {
170                 throwParseException("Found unexpected '.' " + (char) c);
171             }
172 
173             c = skipWhitespace(c);
174             c = parseValue(c);
175             periodConsumed = (c & 0x80000000) != 0;
176             final Value object = this.value;
177 
178             Resource context = null;
179             if (!periodConsumed) {
180                 c = skipWhitespace(c);
181                 if (c != '.') {
182                     c = parseResource(c);
183                     periodConsumed = (c & 0x80000000) != 0;
184                     context = (Resource) this.value;
185                     if (!periodConsumed) {
186                         c = skipWhitespace(c);
187                     }
188                 }
189             }
190 
191             if (c == EOF) {
192                 throwEOFException();
193             } else if (c != '.' && !periodConsumed) {
194                 throwParseException("Expected '.', found: " + (char) c);
195             }
196 
197             c = periodConsumed ? c & 0x7FFFFFFF : read();
198             c = skipWhitespace(c);
199             if (c != EOF && c != '\r' && c != '\n') {
200                 throwParseException("Content after '.' is not allowed");
201             }
202 
203             if (this.rdfHandler != null) {
204                 final Statement statement;
205                 if (context == null || context.equals(SESAME.NIL)) {
206                     statement = createStatement(subject, predicate, object);
207                 } else {
208                     statement = createStatement(subject, predicate, object, context);
209                 }
210                 this.rdfHandler.handleStatement(statement);
211             }
212 
213         } catch (final RDFParseException ex) {
214             if (getParserConfig().isNonFatalError(
215                     NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES)) {
216                 reportError(ex, this.lineNo, -1,
217                         NTriplesParserSettings.FAIL_ON_NTRIPLES_INVALID_LINES);
218             } else {
219                 throw ex;
220             }
221         }
222 
223         c = skipLine(c);
224         return c;
225     }
226 
227     private int parseValue(final int ch) throws IOException, RDFParseException {
228         int c = ch;
229         if (c == '<') {
230             c = parseURI(c);
231         } else if (c == '_') {
232             c = parseBNode(c);
233         } else if (c == '"' || c == '\'') {
234             c = parseLiteral(c);
235         } else if (c == EOF) {
236             throwEOFException();
237         } else {
238             throwParseException("Expected '<', '_' or '\"', found: " + (char) c + "");
239         }
240         return c;
241     }
242 
243     private int parseResource(final int ch) throws IOException, RDFParseException {
244         int c = ch;
245         if (c == '<') {
246             c = parseURI(c);
247         } else if (c == '_') {
248             c = parseBNode(c);
249         } else if (c == EOF) {
250             throwEOFException();
251         } else {
252             throwParseException("Expected '<' or '_', found: " + (char) c);
253         }
254         return c;
255     }
256 
257     private int parseURI(final int ch) throws IOException, RDFParseException {
258         int c = ch;
259         if (c != '<') {
260             throwParseException("Supplied char should be a '<', it is: " + c);
261         }
262         this.builder.setLength(0);
263         c = read();
264         while (c != '>') {
265             switch (c) {
266             case EOF:
267                 throwEOFException();
268                 break;
269             case '\\':
270                 c = read();
271                 if (c == EOF) {
272                     throwEOFException();
273                 } else if (c == 'u' || c == 'U') {
274                     parseUChar(c);
275                 } else {
276                     this.builder.append((char) c); // accept \> and \\ plus others
277                 }
278                 break;
279             default:
280                 if (c < 32) { // discard control chars but accept other chars forbidden by W3C
281                     // rec, for compatibility with previous Turtle specification
282                     throwParseException("Expected valid IRI char, found: " + (char) c);
283                 }
284                 this.builder.append((char) c);
285                 break;
286             }
287             c = read();
288         }
289         this.value = createURI(this.builder.toString());
290         c = read();
291         return c;
292     }
293 
294     private int parseBNode(final int ch) throws IOException, RDFParseException {
295         int c = ch;
296         if (c != '_') {
297             throwParseException("Expected '_', found: " + c);
298         }
299         c = read();
300         if (c == EOF) {
301             throwEOFException();
302         } else if (c != ':') {
303             throwParseException("Expected ':', found: " + (char) c);
304         }
305         c = read();
306         if (c == EOF) {
307             throwEOFException();
308         } else if (!TQL.isPN_CHARS_U(c) && !TQL.isNumber(c)) {
309             throwParseException("Invalid bnode character: " + (char) c);
310         }
311         this.builder.setLength(0);
312         this.builder.append((char) c);
313         c = read();
314         while (c != EOF && TQL.isPN_CHARS(c)) {
315             this.builder.append((char) c);
316             c = read();
317         }
318         final int last = this.builder.length() - 1;
319         if (this.builder.charAt(last) == '.') {
320             this.builder.setLength(last); // remove trailing '.' and mark period found
321             c = c | 0x80000000;
322         }
323         this.value = createBNode(this.builder.toString());
324         return c;
325     }
326 
327     private int parseLiteral(final int ch) throws IOException, RDFParseException {
328         int c = ch;
329         if (c != '"' && c != '\'') {
330             throwParseException("Expected '\"' or '\'', found: " + c);
331         }
332         final int delim = c;
333         this.builder.setLength(0);
334         c = read();
335         while (c != delim) {
336             if (c == EOF) {
337                 throwEOFException();
338             } else if (c == '\\') {
339                 c = read();
340                 switch (c) {
341                 case EOF:
342                     throwEOFException();
343                     break;
344                 case 'b':
345                     this.builder.append('\b');
346                     break;
347                 case 'f':
348                     this.builder.append('\f');
349                     break;
350                 case 'n':
351                     this.builder.append('\n');
352                     break;
353                 case 'r':
354                     this.builder.append('\r');
355                     break;
356                 case 't':
357                     this.builder.append('\t');
358                     break;
359                 case 'u':
360                 case 'U':
361                     parseUChar(c);
362                     break;
363                 default:
364                     this.builder.append((char) c); // handles ' " \
365                     break;
366                 }
367             } else {
368                 this.builder.append((char) c);
369             }
370             c = read();
371         }
372         c = read();
373         final String label = this.builder.toString();
374         if (c == '@') {
375             this.builder.setLength(0);
376             c = read();
377             boolean minusFound = false;
378             while (true) {
379                 if (c == '-' && this.builder.length() > 0) {
380                     minusFound = true;
381                 } else if (!TQL.isLetter(c) && !(TQL.isNumber(c) && minusFound)) {
382                     break;
383                 }
384                 this.builder.append((char) c);
385                 c = read();
386             }
387             if (this.builder.charAt(this.builder.length() - 1) == '-') {
388                 throwParseException("Invalid lang tag: " + this.builder.toString());
389             }
390             final String language = this.builder.toString();
391             this.value = createLiteral(label, language, null, this.lineNo, -1);
392         } else if (c == '^') {
393             c = read();
394             if (c == EOF) {
395                 throwEOFException();
396             } else if (c != '^') {
397                 throwParseException("Expected '^', found: " + (char) c);
398             }
399             c = read();
400             if (c == EOF) {
401                 throwEOFException();
402             } else if (c != '<') {
403                 throwParseException("Expected '<', found: " + (char) c);
404             }
405             c = parseURI(c);
406             final URI datatype = (URI) this.value;
407             this.value = createLiteral(label, null, datatype, this.lineNo, -1);
408         } else {
409             this.value = createLiteral(label, null, null, this.lineNo, -1);
410         }
411         return c;
412     }
413 
414     private void parseUChar(final int ch) throws IOException, RDFParseException {
415         int c = ch;
416         int count = 0;
417         if (c == 'u') {
418             count = 4;
419         } else if (c == 'U') {
420             count = 8;
421         } else {
422             throwParseException("Expected 'u' or 'U', found: " + c);
423         }
424         int code = 0;
425         for (int i = 0; i < count; ++i) {
426             c = read();
427             if (c == EOF) {
428                 throwEOFException();
429             } else {
430                 final int digit = Character.digit(c, 16);
431                 if (digit < 0) {
432                     throwParseException("Expected hex digit, found: " + (char) c);
433                 }
434                 code = code * 16 + digit;
435             }
436         }
437         this.builder.append((char) code);
438     }
439 
440     private int read() throws IOException {
441         return this.reader.read();
442     }
443 
444     private void throwEOFException() throws RDFParseException {
445         throw new RDFParseException("Unexpected end of file", this.lineNo, -1);
446     }
447 
448     private void throwParseException(final String message) throws RDFParseException {
449         throw new RDFParseException(message, this.lineNo, -1);
450     }
451 
452 }