1   /*
2    * RDFpro - An extensible tool for building stream-oriented RDF processing libraries.
3    * 
4    * Written in 2014 by Francesco Corcoglioniti with support by Marco Amadori, Michele Mostarda,
5    * Alessio Palmero Aprosio and Marco Rospocher. Contact info on http://rdfpro.fbk.eu/
6    * 
7    * To the extent possible under law, the authors have dedicated all copyright and related and
8    * neighboring rights to this software to the public domain worldwide. This software is
9    * distributed without any warranty.
10   * 
11   * You should have received a copy of the CC0 Public Domain Dedication along with this software.
12   * If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
13   */
14  package eu.fbk.rdfpro;
15  
16  import java.io.Serializable;
17  import java.util.Objects;
18  
19  /**
20   * A set/multiset operator.
21   */
22  public abstract class SetOperator implements Serializable, Comparable<SetOperator> {
23  
24      private static final long serialVersionUID = 1L;
25  
26      /**
27       * Set union. This {@code SetOperator} ignores duplicates and returns a unique copy of an
28       * element only if it occurs at least in one of the arguments of the operation. The
29       * corresponding textual representation is "u".
30       */
31      public static final SetOperator UNION = new SetOperator("u") {
32  
33          private static final long serialVersionUID = 1L;
34  
35          @Override
36          public int apply(final int[] counts) {
37              for (int i = 0; i < counts.length; ++i) {
38                  if (counts[i] > 0) {
39                      return 1;
40                  }
41              }
42              return 0;
43          }
44  
45      };
46  
47      /**
48       * Multiset union. This {@code SetOperator} returns an element if it occurs in at least one
49       * argument of the operation, with a multiplicity equal to the one of the argument it occurs
50       * the most. The corresponding textual representation is "U". See <a
51       * href="http://en.wikipedia.org/wiki/Multiset">this page</a> for more information.
52       */
53      public static final SetOperator UNION_MULTISET = new SetOperator("U") {
54  
55          private static final long serialVersionUID = 1L;
56  
57          @Override
58          public int apply(final int[] counts) {
59              int result = counts[0];
60              for (int i = 1; i < counts.length; ++i) {
61                  if (counts[i] > result) {
62                      result = counts[i];
63                  }
64              }
65              return result;
66          }
67  
68      };
69  
70      /**
71       * Multiset sum (a.k.a. union all). This {@code SetOperator} returns an element if it occurs
72       * at least in one argument of the operation, with a multiplicity that is the sum of the
73       * multiplicities of the element in the multisets coming from the operation arguments. The
74       * corresponding textual representation is "a". See <a
75       * href="http://en.wikipedia.org/wiki/Multiset">this page</a> for more information.
76       */
77      public static final SetOperator SUM_MULTISET = new SetOperator("a") {
78  
79          private static final long serialVersionUID = 1L;
80  
81          @Override
82          public int apply(final int[] counts) {
83              int result = 0;
84              for (int i = 0; i < counts.length; ++i) {
85                  result += counts[i];
86              }
87              return result;
88          }
89  
90      };
91  
92      /**
93       * Set intersection. This {@code SetOperator} ignores duplicates and returns an element only
94       * if it occurs in all the arguments of the operation. In that case, exactly one copy of the
95       * element is emitted. The corresponding textual representation is "i".
96       */
97      public static final SetOperator INTERSECTION = new SetOperator("i") {
98  
99          private static final long serialVersionUID = 1L;
100 
101         @Override
102         public int apply(final int[] counts) {
103             for (int i = 0; i < counts.length; ++i) {
104                 if (counts[i] == 0) {
105                     return 0;
106                 }
107             }
108             return 1;
109         }
110 
111     };
112 
113     /**
114      * Multiset intersection. This {@code SetOperator} returns an element if it occurs in all the
115      * arguments of the operation, with a multiplicity equal to the one of the argument it occurs
116      * the least. The corresponding textual representation is "I". See <a
117      * href="http://en.wikipedia.org/wiki/Multiset">this page</a> for more information.
118      */
119     public static final SetOperator INTERSECTION_MULTISET = new SetOperator("I") {
120 
121         private static final long serialVersionUID = 1L;
122 
123         @Override
124         public int apply(final int[] counts) {
125             int result = counts[0];
126             for (int i = 1; i < counts.length; ++i) {
127                 if (counts[i] < result) {
128                     result = counts[i];
129                 }
130             }
131             return result;
132         }
133 
134     };
135 
136     /**
137      * Set difference. This {@code SetOperator} ignores duplicates and returns the set difference
138      * between the first argument (index 0) on one side, and all the other arguments on the other
139      * side (i.e., {@code A0 \ A1 ... \An} or equivalently {@code A0 \ (A1 U ... U An)}). At most
140      * a copy of a given element is returned. The corresponding textual representation is "d".
141      */
142     public static final SetOperator DIFFERENCE = new SetOperator("d") {
143 
144         private static final long serialVersionUID = 1L;
145 
146         @Override
147         public int apply(final int[] counts) {
148             if (counts[0] == 0) {
149                 return 0;
150             }
151             for (int i = 1; i < counts.length; ++i) {
152                 if (counts[i] > 0) {
153                     return 0;
154                 }
155             }
156             return 1;
157         }
158 
159     };
160 
161     /**
162      * Multiset difference. This {@code SetOperator} returns the multiset difference between the
163      * first argument (index 0) on one side, and all the other arguments on the other side (i.e.,
164      * {@code A0 \ A1 ... \An} or equivalently {@code A0 \ (A1 + ... + An)} where {@code +}
165      * denotes multiset sum). The multiplicity of a returned element is equal to its multiplicity
166      * in the first argument minus the sum of its multiplicities in the other arguments (rounded
167      * to 0 if negative). The corresponding textual representation is "D". See <a
168      * href="http://en.wikipedia.org/wiki/Multiset">this page</a> for more information.
169      */
170     public static final SetOperator DIFFERENCE_MULTISET = new SetOperator("D") {
171 
172         private static final long serialVersionUID = 1L;
173 
174         @Override
175         public int apply(final int[] counts) {
176             int result = counts[0];
177             for (int i = 1; i < counts.length && result > 0; ++i) {
178                 result -= counts[i];
179             }
180             return result > 0 ? result : 0;
181         }
182 
183     };
184 
185     /**
186      * Set symmetric difference. This {@code SetOperator} ignores duplicates and returns all the
187      * elements that are present in at least one argument of the operator but not in all of them.
188      * For two arguments, this corresponds to evaluating {@code (A0 \ A1) U (A1 \ A0)}. At most a
189      * copy of an element is returned. The corresponding textual representation is "s".
190      */
191     public static final SetOperator SYMMETRIC_DIFFERENCE = new SetOperator("s") {
192 
193         private static final long serialVersionUID = 1L;
194 
195         @Override
196         public int apply(final int[] counts) {
197             int result = 0;
198             for (int i = 0; i < counts.length; ++i) {
199                 if (counts[i] > 0) {
200                     ++result;
201                 }
202             }
203             return result < counts.length ? 1 : 0;
204         }
205 
206     };
207 
208     /**
209      * Multiset symmetric difference. This {@code SetOperator} returns the elements that are
210      * present in at least one argument of the operator, with a multiplicity that is the
211      * difference between the multiplicity in the argument it occurs the most and the multiplicity
212      * in the argument it occurs the least. For two arguments, this corresponds to evaluating (the
213      * multiset version of) {@code (A0 \ A1) U (A1 \ A0)}. The corresponding textual
214      * representation is "S". See <a href="http://en.wikipedia.org/wiki/Multiset">this page</a>
215      * for more information.
216      */
217     public static final SetOperator SYMMETRIC_DIFFERENCE_MULTISET = new SetOperator("S") {
218 
219         private static final long serialVersionUID = 1L;
220 
221         @Override
222         public int apply(final int[] counts) {
223             int min = counts[0];
224             int max = min;
225             for (int i = 1; i < counts.length; ++i) {
226                 final int n = counts[i];
227                 if (n < min) {
228                     min = n;
229                 }
230                 if (n > max) {
231                     max = n;
232                 }
233             }
234             return max - min;
235         }
236 
237     };
238 
239     /**
240      * At least N occurrences. Creates a parametric {@code SetOperator} that checks that the total
241      * number of occurrences of a certain element, summed over all the operation arguments and
242      * counting duplicates, is AT LEAST N. In that case a single copy of the element is emitted.
243      * The corresponding textual representation is "N+".
244      *
245      * @param n
246      *            the threshold, greater or equal to 1
247      * @return the created {@code SetOperator} object
248      */
249     public static SetOperator atLeast(final int n) {
250         if (n < 1) {
251             throw new IllegalArgumentException("Invalid threshold " + n);
252         }
253         return new SetOperator(n + "+") {
254 
255             private static final long serialVersionUID = 1L;
256 
257             @Override
258             public int apply(final int[] counts) {
259                 int result = 0;
260                 for (int i = 0; i < counts.length; ++i) {
261                     if (counts[i] > 0) {
262                         ++result;
263                     }
264                 }
265                 return result >= n ? 1 : 0;
266             }
267 
268         };
269     }
270 
271     /**
272      * At most N occurrences. Creates a parametric {@code SetOperator} that checks that the total
273      * number of occurrences of a certain element, summed over all the operation arguments and
274      * counting duplicates, is AT MOST N. In that case a single copy of the element is emitted.
275      * The corresponding textual representation is "N-".
276      *
277      * @param n
278      *            the threshold, greater or equal to 1
279      * @return the created {@code SetOperator} object
280      */
281     public static SetOperator atMost(final int n) {
282         if (n < 1) {
283             throw new IllegalArgumentException("Invalid threshold " + n);
284         }
285         return new SetOperator(n + "-") {
286 
287             private static final long serialVersionUID = 1L;
288 
289             @Override
290             public int apply(final int[] counts) {
291                 int result = 0;
292                 for (int i = 0; i < counts.length; ++i) {
293                     result += counts[i];
294                 }
295                 return result <= n ? 1 : 0;
296             }
297 
298         };
299     }
300 
301     private final String string;
302 
303     private SetOperator(final String string) {
304         this.string = string;
305     }
306 
307     private Object writeReplace() {
308         return new SerializedForm(this.string);
309     }
310 
311     /**
312      * Applies the operator. The method is called for a given element, supplying it the
313      * multiplicities of the element in each argument (a set or a multiset) of the operation. The
314      * method returns the multiplicity of the element in the operation result.
315      *
316      * @param multiplicities
317      *            the multiplicities of the element in the operation arguments (non-null,
318      *            non-empty array)
319      * @return the resulting multiplicity
320      */
321     public abstract int apply(int[] multiplicities);
322 
323     /**
324      * {@inheritDoc} Comparison is based on the textual representation of set operators.
325      */
326     @Override
327     public final int compareTo(final SetOperator other) {
328         return this.string.compareTo(other.string);
329     }
330 
331     /**
332      * {@inheritDoc} Equality is implemented based on the textual representation of set operators.
333      */
334     @Override
335     public final boolean equals(final Object object) {
336         if (object == this) {
337             return true;
338         }
339         if (!(object instanceof SetOperator)) {
340             return false;
341         }
342         final SetOperator other = (SetOperator) object;
343         return this.string.equals(other.string);
344     }
345 
346     /**
347      * {@inheritDoc} The hash code is computed based on the textual representation of the set
348      * operator.
349      */
350     @Override
351     public final int hashCode() {
352         return this.string.hashCode();
353     }
354 
355     /**
356      * {@inheritDoc} Returns the textual representation of the set operator that uniquely
357      * identifies it.
358      */
359     @Override
360     public final String toString() {
361         return this.string;
362     }
363 
364     /**
365      * Returns the {@code SetOperator} for the textual representation supplied.
366      *
367      * @param string
368      *            the textual representation, not null
369      * @return the corresponding {@code SetOperator}, upon success
370      */
371     public static SetOperator valueOf(final String string) {
372         Objects.requireNonNull(string);
373         for (final SetOperator operation : new SetOperator[] { UNION, UNION_MULTISET,
374                 SUM_MULTISET, INTERSECTION, INTERSECTION_MULTISET, DIFFERENCE,
375                 DIFFERENCE_MULTISET, SYMMETRIC_DIFFERENCE, SYMMETRIC_DIFFERENCE_MULTISET }) {
376             if (operation.string.equals(string)) {
377                 return operation;
378             }
379         }
380         if (string.endsWith("+")) {
381             final int n = Integer.parseInt(string.substring(0, string.length() - 1));
382             return atLeast(n);
383         }
384         if (string.endsWith("-")) {
385             final int n = Integer.parseInt(string.substring(0, string.length() - 1));
386             return atMost(n);
387         }
388         throw new IllegalArgumentException("Unknown set operator '" + string + "'");
389     }
390 
391     static final class SerializedForm implements Serializable {
392 
393         private static final long serialVersionUID = 1L;
394 
395         private final String string;
396 
397         public SerializedForm(final String string) {
398             this.string = string;
399         }
400 
401         Object readResolve() {
402             return SetOperator.valueOf(this.string);
403         }
404 
405     }
406 
407 }