View Javadoc

1   package io.github.reggert.reb4j.charclass;
2   
3   import static fj.Ord.charOrd;
4   
5   import io.github.reggert.reb4j.AbstractQuantifiableSequenceableAlternative;
6   import io.github.reggert.reb4j.Alternative;
7   import io.github.reggert.reb4j.Expression;
8   import io.github.reggert.reb4j.Quantifiable;
9   import io.github.reggert.reb4j.Sequenceable;
10  
11  import java.lang.Character.UnicodeBlock;
12  
13  
14  import fj.data.LazyString;
15  import fj.data.Set;
16  
17  /**
18   * Base class representing an expression that matches a single character 
19   * within a class of characters.
20   */
21  public abstract class CharClass extends AbstractQuantifiableSequenceableAlternative
22  	implements Expression, Quantifiable, Sequenceable, Alternative
23  {
24  	private static final long serialVersionUID = 1L;
25  
26  	/**
27  	 * The regular expression string that can be used within square brackets
28  	 * to merge with other character classes.
29  	 */
30  	protected abstract LazyString unitableForm();
31  	
32  	/**
33  	 * The regular expression string that can be used independently of square 
34  	 * brackets.
35  	 */
36  	protected abstract LazyString independentForm();
37  	
38  	@Override
39  	public final LazyString expression()
40  	{return independentForm();}
41  	
42  	@Override
43  	public Integer boundedLength() 
44  	{
45  		return 1;
46  	}
47  
48  	@Override
49  	public boolean repetitionInvalidatesBounds() 
50  	{
51  		return false;
52  	}
53  
54  	@Override
55  	public boolean possiblyZeroLength() 
56  	{
57  		return false;
58  	}
59  
60  	/**
61  	 * Returns an expressing matching a single character that is not within
62  	 * the class of characters matched by this expression.
63  	 */
64  	public abstract CharClass negated();
65  	
66  	/**
67  	 * Returns the union of this character class with the specified
68  	 * character classes.
69  	 */
70  	public Union union(final Union right)
71  	{return Union.union(this, right);}
72  
73  	/**
74  	 * Returns the union of this character class with the specified
75  	 * character class.
76  	 */
77  	public Union union(final CharClass right)
78  	{return Union.union(this, right);}
79  	
80  	/**
81  	 * Returns the intersection of this character class with the 
82  	 * specified character class.
83  	 */
84  	public Intersection intersect(final CharClass right)
85  	{return Intersection.intersect(this, right);}
86  
87  	/**
88  	 * Returns the intersection of this character class with the 
89  	 * specified character classes.
90  	 */
91  	public Intersection intersect(final Intersection right)
92  	{return Intersection.intersect(this, right);}
93  	
94  	/**
95  	 * Constructs a character class consisting of a single character.
96  	 */
97  	public static SingleChar character(final char c)
98  	{return new SingleChar(c);}
99  	
100 	/**
101 	 * Constructs a character class consisting of several characters.
102 	 */
103 	public static MultiChar characters(final char c1, final char c2, final char... cs)
104 	{
105 		Set<Character> set = Set.set(charOrd, c1, c2);
106 		for (final char c : cs)
107 			set = set.insert(c);
108 		return new MultiChar(set);
109 	}
110 	
111 	/**
112 	 * Constructs a character class consisting of all characters in
113 	 * a range.
114 	 * 
115 	 * @param first
116 	 * 	the minimum bound for the range.
117 	 * @param last
118 	 * 	the maximum bound for the range; must be greater than <var>first</var>.
119 	 * @throws IllegalArgumentException
120 	 * 	if <var>last</var> &lt;= <var>first</var>.
121 	 */
122 	public static CharRange range(final char first, final char last)
123 	{return new CharRange(first, last);}
124 	
125 	/**
126 	 * Module containing Perl-style predefined character classes.
127 	 */
128 	public static abstract class Perl
129 	{
130 		private Perl() {}
131 		
132 		/**
133 		 * Perl-style character class that matches a single decimal digit.
134 		 */
135 		public static final PredefinedClass DIGIT = new PredefinedClass('d');
136 		
137 		/**
138 		 * Perl-style character class that matches whitespace.
139 		 */
140 		public static final PredefinedClass SPACE = new PredefinedClass('s');
141 		
142 		/**
143 		 * Perl-style character class that matches "word" characters.
144 		 */
145 		public static final PredefinedClass WORD = new PredefinedClass('w');
146 	}
147 	
148 	/**
149 	 * Module containing POSIX-style predefined character classes.
150 	 */
151 	public static abstract class Posix
152 	{
153 		private Posix() {}
154 		
155 		/**
156 		 * POSIX-style character class that matches a lowercase letter.
157 		 */
158 		public static final NamedPredefinedClass LOWER = new NamedPredefinedClass("Lower");
159 		
160 		/**
161 		 * POSIX-style character class that matches an uppercase letter.
162 		 */
163 		public static final NamedPredefinedClass UPPER = new NamedPredefinedClass("Upper");
164 		
165 		/**
166 		 * POSIX-style character class that matches an alphabetical letter.
167 		 */
168 		public static final NamedPredefinedClass ALPHA = new NamedPredefinedClass("Alpha");
169 		
170 		/**
171 		 * POSIX-style character class that matches a decimal digit.
172 		 */
173 		public static final NamedPredefinedClass DIGIT = new NamedPredefinedClass("Digit");
174 		
175 		/**
176 		 * POSIX-style character class that matches letters or digits.
177 		 */
178 		public static final NamedPredefinedClass ALNUM = new NamedPredefinedClass("Alnum");
179 		
180 		/**
181 		 * POSIX-style character class that matches a punctuation character.
182 		 */
183 		public static final NamedPredefinedClass PUNCT = new NamedPredefinedClass("Punct");
184 		
185 		/**
186 		 * POSIX-style character class that matches a graphical character.
187 		 */
188 		public static final NamedPredefinedClass GRAPH = new NamedPredefinedClass("Graph");
189 		
190 		/**
191 		 * POSIX-style character class that matches any printable character.
192 		 */
193 		public static final NamedPredefinedClass PRINT = new NamedPredefinedClass("Print");
194 		
195 		/**
196 		 * POSIX-style character class that matches a space or tab.
197 		 */
198 		public static final NamedPredefinedClass BLANK = new NamedPredefinedClass("Blank");
199 		
200 		/**
201 		 * POSIX-style character class that matches a control character.
202 		 */
203 		public static final NamedPredefinedClass CONTROL = new NamedPredefinedClass("Cntrl");
204 		
205 		/**
206 		 * POSIX-style character class that matches a hexadecimal digit.
207 		 */
208 		public static final NamedPredefinedClass HEX_DIGIT = new NamedPredefinedClass("XDigit");
209 		
210 		/**
211 		 * POSIX-style character class that matches any whitespace character.
212 		 */
213 		public static final NamedPredefinedClass SPACE = new NamedPredefinedClass("Space");
214 	}
215 	
216 	/**
217 	 * Module containing predefined character classes matching character traits
218 	 * defined by the {@link java.lang.Character} class.
219 	 */
220 	public static abstract class Java
221 	{
222 		private Java() {}
223 		
224 		/**
225 		 * Matches any single lowercase letter.
226 		 */
227 		public static final NamedPredefinedClass LOWER_CASE = new NamedPredefinedClass("javaLowerCase");
228 		
229 		/**
230 		 * Matches any single uppercase letter.
231 		 */
232 		public static final NamedPredefinedClass UPPER_CASE = new NamedPredefinedClass("javaUpperCase");
233 		
234 		/**
235 		 * Matches any single whitespace character.
236 		 */
237 		public static final NamedPredefinedClass WHITESPACE = new NamedPredefinedClass("javaWhitespace");
238 		
239 		/**
240 		 * Matches any single "mirrored" character.
241 		 */
242 		public static final NamedPredefinedClass MIRROR = new NamedPredefinedClass("javaMirrored");
243 	}
244 	
245 	/**
246 	 * Module containing predefined character classes matching a single
247 	 * character that has certain traits defined by the Unicode specification.
248 	 */
249 	public static abstract class Unicode
250 	{
251 		private Unicode() {}
252 		private static NamedPredefinedClass z (final String className) 
253 		{return new NamedPredefinedClass(className);}
254 		
255 		/**
256 		 * Creates a character class matching any single character within the
257 		 * specified Unicode block.
258 		 */
259 		public static NamedPredefinedClass block(final UnicodeBlock unicodeBlock)
260 		{return z("In" + unicodeBlock.toString());}
261 		
262 		/*
263 			From the Unicode Specification, version 4.0.0, the Unicode categories are:
264 				Lu = Letter, uppercase
265 				Ll = Letter, lowercase
266 				Lt = Letter, titlecase
267 				Lm = Letter, modifier
268 				Lo = Letter, other
269 				Mn = Mark, nonspacing
270 				Mc = Mark, spacing combining
271 				Me = Mark, enclosing
272 				Nd = Number, decimal digit
273 				Nl = Number, letter
274 				No = Number, other
275 				Zs = Separator, space
276 				Zl = Separator, line
277 				Zp = Separator, paragraph
278 				Cc = Other, control
279 				Cf = Other, format
280 				Cs = Other, surrogate
281 				Co = Other, private use
282 				Cn = Other, not assigned (including noncharacters)
283 				Pc = Punctuation, connector
284 				Pd = Punctuation, dash
285 				Ps = Punctuation, open
286 				Pe = Punctuation, close
287 				Pi = Punctuation, initial quote (may behave like Ps or Pe depending on usage)
288 				Pf = Punctuation, final quote (may behave like Ps or Pe depending on usage)
289 				Po = Punctuation, other
290 				Sm = Symbol, math
291 				Sc = Symbol, currency
292 				Sk = Symbol, modifier
293 				So = Symbol, other
294 		 */
295 		
296 		/**
297 		 * Module containing predefined character classes matching any single
298 		 * character defined as a "letter" by the Unicode specification.
299 		 */
300 		public static abstract class Letter
301 		{
302 			private Letter() {}
303 			
304 			/**
305 			 * Matches uppercase letters.
306 			 */
307 			public static final NamedPredefinedClass UPPER_CASE = z("Lu");
308 			
309 			/**
310 			 * Matches lowercase letters.
311 			 */
312 			public static final NamedPredefinedClass LOWER_CASE = z("Ll");
313 			
314 			/**
315 			 * Matches titlecase letters.
316 			 */
317 			public static final NamedPredefinedClass TITLE_CASE = z("Lt");
318 			
319 			/**
320 			 * Matches letter modifiers.
321 			 */
322 			public static final NamedPredefinedClass MODIFIER = z("Lm");
323 			
324 			/**
325 			 * Matches "other" letters defined by Unicode.
326 			 */
327 			public static final NamedPredefinedClass OTHER = z("Lo");
328 		}
329 		
330 		/**
331 		 * Module containing predefined character classes matching any single
332 		 * character defined as a "mark" by the Unicode specification.
333 		 */
334 		public static abstract class Mark
335 		{
336 			private Mark() {}
337 			
338 			/**
339 			 * Matches nonspacing marks.
340 			 */
341 			public static final NamedPredefinedClass NONSPACING = z("Mn");
342 			
343 			/**
344 			 * Matches spacing-combining marks.
345 			 */
346 			public static final NamedPredefinedClass SPACING_COMBINING = z("Mc");
347 			
348 			/**
349 			 * Matches enclosing marks.
350 			 */
351 			public static final NamedPredefinedClass ENCLOSING = z("Me");
352 		}
353 		
354 		/**
355 		 * Module containing predefined character classes matching any single
356 		 * character defined as a "number" by the Unicode specification.
357 		 */
358 		public static abstract class Number
359 		{
360 			private Number() {}
361 			
362 			/**
363 			 * Matches decimal digits.
364 			 */
365 			public static final NamedPredefinedClass DECIMAL_DIGIT = z("Nd");
366 			
367 			/**
368 			 * Matches letter characters used as digits.
369 			 */
370 			public static final NamedPredefinedClass LETTER = z("Nl");
371 			
372 			/**
373 			 * Matches "other" digit characters defined by Unicode.
374 			 */
375 			public static final NamedPredefinedClass OTHER = z("No");
376 		}
377 		
378 		/**
379 		 * Module containing predefined character classes matching any single
380 		 * character defined as a "separator" by the Unicode specification.
381 		 */
382 		public static abstract class Separator
383 		{
384 			private Separator() {}
385 			
386 			/**
387 			 * Mataches spaces.
388 			 */
389 			public static final NamedPredefinedClass SPACE = z("Zs");
390 			
391 			/**
392 			 * Matches line breaks.
393 			 */
394 			public static final NamedPredefinedClass LINE = z("Zl");
395 			
396 			/**
397 			 * Matches paragraph breaks.
398 			 */
399 			public static final NamedPredefinedClass PARAGRAPH = z("Zp");
400 		}
401 		
402 		/**
403 		 * Module containing predefined character classes matching any single
404 		 * character that does not fit into any other category defined by the
405 		 * Unicode specification.
406 		 */
407 		public static abstract class Other
408 		{
409 			private Other() {}
410 			
411 			/**
412 			 * Matches control characters.
413 			 */
414 			public static final NamedPredefinedClass CONTROL = z("Cc");
415 			
416 			/**
417 			 * Matches formatting characters.
418 			 */
419 			public static final NamedPredefinedClass FORMAT = z("Cf");
420 			
421 			/**
422 			 * Matches surrogate characters.
423 			 */
424 			public static final NamedPredefinedClass SURROGATE = z("Cs");
425 			
426 			/**
427 			 * Matches characters defined for private use.
428 			 */
429 			public static final NamedPredefinedClass PRIVATE_USE = z("Co");
430 			
431 			/**
432 			 * Matches unassigned characters.
433 			 */
434 			public static final NamedPredefinedClass NOT_ASSIGNED = z("Cn");
435 		}
436 		
437 		/**
438 		 * Module containing predefined character classes matching any single
439 		 * character that is defined as "punctuation" by the Unicode 
440 		 * specification.
441 		 */
442 		public static abstract class Punctuation
443 		{
444 			/**
445 			 * Matches connectors.
446 			 */
447 			public static final NamedPredefinedClass CONNECTOR = z("Pc");
448 			
449 			/**
450 			 * Matches dashes.
451 			 */
452 			public static final NamedPredefinedClass DASH = z("Pd");
453 			
454 			/**
455 			 * Matches "opening" punctuation.
456 			 */
457 			public static final NamedPredefinedClass OPEN = z("Po");
458 			
459 			/**
460 			 * Matches "closing" punctuation.
461 			 */
462 			public static final NamedPredefinedClass CLOSE = z("Pe");
463 			
464 			/**
465 			 * Matches initial quotes.
466 			 */
467 			public static final NamedPredefinedClass INITIAL_QUOTE = z("Pi");
468 			
469 			/**
470 			 * Matches closing quotes.
471 			 */
472 			public static final NamedPredefinedClass FINAL_QUOTE = z("Pf");
473 			
474 			/**
475 			 * Matches other punctuation.
476 			 */
477 			public static final NamedPredefinedClass OTHER = z("Po");
478 		}
479 		
480 		/**
481 		 * Module containing predefined character classes matching any single
482 		 * character that is defined as a "symbol" by the Unicode 
483 		 * specification.
484 		 */
485 		public static abstract class Symbol
486 		{
487 			/**
488 			 * Matches mathematical symbols.
489 			 */
490 			public static final NamedPredefinedClass MATH = z("Sm");
491 			
492 			/**
493 			 * Matches currency symbols.
494 			 */
495 			public static final NamedPredefinedClass CURRENCY = z("Sc");
496 			
497 			/**
498 			 * Matches symbol modifiers.
499 			 */
500 			public static final NamedPredefinedClass MODIFIER = z("Sk");
501 			
502 			/**
503 			 * Matches other symbols.
504 			 */
505 			public static final NamedPredefinedClass OTHER = z("So");
506 		}
507 	}
508 }