| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  | package org.nwapw.abacus.lexing.pattern;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import java.util.Collection;
 | 
					
						
							|  |  |  | import java.util.HashMap;
 | 
					
						
							|  |  |  | import java.util.Stack;
 | 
					
						
							|  |  |  | import java.util.function.Function;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-07-25 22:47:48 -07:00
										 |  |  | /**
 | 
					
						
							|  |  |  |  * A pattern that can be compiled from a string and used in lexing.
 | 
					
						
							|  |  |  |  * @param <T> the type that is used to identify and sort this pattern.
 | 
					
						
							|  |  |  |  */
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  | public class Pattern<T> {
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-07-25 22:47:48 -07:00
										 |  |  |     /**
 | 
					
						
							|  |  |  |      * The ID of this pattern.
 | 
					
						
							|  |  |  |      */
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |     private T id;
 | 
					
						
							| 
									
										
										
										
											2017-07-25 22:47:48 -07:00
										 |  |  |     /**
 | 
					
						
							|  |  |  |      * The head of this pattern.
 | 
					
						
							|  |  |  |      */
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |     private PatternNode<T> head;
 | 
					
						
							| 
									
										
										
										
											2017-07-25 22:47:48 -07:00
										 |  |  |     /**
 | 
					
						
							|  |  |  |      * The source string of this pattern.
 | 
					
						
							|  |  |  |      */
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |     private String source;
 | 
					
						
							| 
									
										
										
										
											2017-07-25 22:47:48 -07:00
										 |  |  |     /**
 | 
					
						
							|  |  |  |      * The index at which the compilation has stopped.
 | 
					
						
							|  |  |  |      */
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |     private int index;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-07-25 22:47:48 -07:00
										 |  |  |     /**
 | 
					
						
							|  |  |  |      * A map of regex operator to functions that modify a PatternChain
 | 
					
						
							|  |  |  |      * with the appropriate operation.
 | 
					
						
							|  |  |  |      */
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |     private HashMap<Character, Function<PatternChain<T>, PatternChain<T>>> operations =
 | 
					
						
							|  |  |  |             new HashMap<Character, Function<PatternChain<T>, PatternChain<T>>>() {{
 | 
					
						
							|  |  |  |                 put('+', Pattern.this::transformPlus);
 | 
					
						
							|  |  |  |                 put('*', Pattern.this::transformStar);
 | 
					
						
							|  |  |  |                 put('?', Pattern.this::transformQuestion);
 | 
					
						
							|  |  |  |             }};
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-07-25 22:47:48 -07:00
										 |  |  |     /**
 | 
					
						
							|  |  |  |      * A regex operator function that turns the chain
 | 
					
						
							|  |  |  |      * into a one-or-more chain.
 | 
					
						
							|  |  |  |      * @param chain the chain to transform.
 | 
					
						
							|  |  |  |      * @return the modified chain.
 | 
					
						
							|  |  |  |      */
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |     private PatternChain<T> transformPlus(PatternChain<T> chain){
 | 
					
						
							|  |  |  |         chain.tail.outputStates.add(chain.head);
 | 
					
						
							|  |  |  |         return chain;
 | 
					
						
							|  |  |  |     }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-07-25 22:47:48 -07:00
										 |  |  |     /**
 | 
					
						
							|  |  |  |      * A regex operator function that turns the chain
 | 
					
						
							|  |  |  |      * into a zero-or-more chain.
 | 
					
						
							|  |  |  |      * @param chain the chain to transform.
 | 
					
						
							|  |  |  |      * @return the modified chain.
 | 
					
						
							|  |  |  |      */
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |     private PatternChain<T> transformStar(PatternChain<T> chain){
 | 
					
						
							|  |  |  |         LinkNode<T> newTail = new LinkNode<>();
 | 
					
						
							|  |  |  |         LinkNode<T> newHead = new LinkNode<>();
 | 
					
						
							|  |  |  |         newHead.outputStates.add(chain.head);
 | 
					
						
							|  |  |  |         newHead.outputStates.add(newTail);
 | 
					
						
							|  |  |  |         chain.tail.outputStates.add(newTail);
 | 
					
						
							|  |  |  |         newTail.outputStates.add(newHead);
 | 
					
						
							|  |  |  |         chain.head = newHead;
 | 
					
						
							|  |  |  |         chain.tail = newTail;
 | 
					
						
							|  |  |  |         return chain;
 | 
					
						
							|  |  |  |     }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-07-25 22:47:48 -07:00
										 |  |  |     /**
 | 
					
						
							|  |  |  |      * A regex operator function that turns the chain
 | 
					
						
							|  |  |  |      * into a zero-or-one chain.
 | 
					
						
							|  |  |  |      * @param chain the chain to transform.
 | 
					
						
							|  |  |  |      * @return the modified chain.
 | 
					
						
							|  |  |  |      */
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |     private PatternChain<T> transformQuestion(PatternChain<T> chain){
 | 
					
						
							|  |  |  |         LinkNode<T> newTail = new LinkNode<>();
 | 
					
						
							|  |  |  |         LinkNode<T> newHead = new LinkNode<>();
 | 
					
						
							|  |  |  |         newHead.outputStates.add(chain.head);
 | 
					
						
							|  |  |  |         newHead.outputStates.add(newTail);
 | 
					
						
							|  |  |  |         chain.tail.outputStates.add(newTail);
 | 
					
						
							|  |  |  |         chain.head = newHead;
 | 
					
						
							|  |  |  |         chain.tail = newTail;
 | 
					
						
							|  |  |  |         return chain;
 | 
					
						
							|  |  |  |     }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-07-25 22:47:48 -07:00
										 |  |  |     /**
 | 
					
						
							|  |  |  |      * Combines a collection of chains into one OR chain.
 | 
					
						
							|  |  |  |      * @param collection the collection of chains to combine.
 | 
					
						
							|  |  |  |      * @return the resulting OR chain.
 | 
					
						
							|  |  |  |      */
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |     private PatternChain<T> combineChains(Collection<PatternChain<T>> collection){
 | 
					
						
							|  |  |  |         LinkNode<T> head = new LinkNode<>();
 | 
					
						
							|  |  |  |         LinkNode<T> tail = new LinkNode<>();
 | 
					
						
							|  |  |  |         PatternChain<T> newChain = new PatternChain<>(head, tail);
 | 
					
						
							|  |  |  |         for(PatternChain<T> chain : collection){
 | 
					
						
							|  |  |  |             head.outputStates.add(chain.head);
 | 
					
						
							|  |  |  |             chain.tail.outputStates.add(tail);
 | 
					
						
							|  |  |  |         }
 | 
					
						
							|  |  |  |         return newChain;
 | 
					
						
							|  |  |  |     }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-07-25 22:47:48 -07:00
										 |  |  |     /**
 | 
					
						
							|  |  |  |      * Parses a single value from the input into a chain.
 | 
					
						
							|  |  |  |      * @return the resulting chain, or null on error.
 | 
					
						
							|  |  |  |      */
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |     private PatternChain<T> parseValue(){
 | 
					
						
							|  |  |  |         if(index >= source.length()) return null;
 | 
					
						
							|  |  |  |         if(source.charAt(index) == '\\'){
 | 
					
						
							|  |  |  |             if(++index >= source.length()) return null;
 | 
					
						
							|  |  |  |         }
 | 
					
						
							|  |  |  |         return new PatternChain<>(new ValueNode<>(source.charAt(index++)));
 | 
					
						
							|  |  |  |     }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-07-25 22:47:48 -07:00
										 |  |  |     /**
 | 
					
						
							|  |  |  |      * Parses a [] range from the input into a chain.
 | 
					
						
							|  |  |  |      * @return the resulting chain, or null on error.
 | 
					
						
							|  |  |  |      */
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |     private PatternChain<T> parseOr(){
 | 
					
						
							|  |  |  |         Stack<PatternChain<T>> orStack = new Stack<>();
 | 
					
						
							| 
									
										
										
										
											2017-07-24 20:47:13 -07:00
										 |  |  |         index++;
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |         while(index < source.length() && source.charAt(index) != ']'){
 | 
					
						
							|  |  |  |             if(source.charAt(index) == '-'){
 | 
					
						
							|  |  |  |                 index++;
 | 
					
						
							|  |  |  |                 if(orStack.empty() || orStack.peek().tail.range() == '\0') return null;
 | 
					
						
							|  |  |  |                 PatternChain<T> bottomRange = orStack.pop();
 | 
					
						
							|  |  |  |                 PatternChain<T> topRange = parseValue();
 | 
					
						
							|  |  |  |                 if(topRange == null || topRange.tail.range() == '\0') return null;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 orStack.push(new PatternChain<>(new RangeNode<>(bottomRange.tail.range(), topRange.tail.range())));
 | 
					
						
							|  |  |  |             } else {
 | 
					
						
							|  |  |  |                 PatternChain<T> newChain = parseValue();
 | 
					
						
							|  |  |  |                 if(newChain == null) return null;
 | 
					
						
							|  |  |  |                 orStack.push(newChain);
 | 
					
						
							|  |  |  |             }
 | 
					
						
							|  |  |  |         }
 | 
					
						
							| 
									
										
										
										
											2017-07-24 20:47:13 -07:00
										 |  |  |         if(index++ >= source.length()) return null;
 | 
					
						
							|  |  |  |         return (orStack.size() == 1) ? orStack.pop() : combineChains(orStack);
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |     }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-07-25 22:47:48 -07:00
										 |  |  |     /**
 | 
					
						
							|  |  |  |      * Parses a repeatable segment from the input into a chain
 | 
					
						
							|  |  |  |      * @param isSubsegment whether the segment is a sub-expression "()", and therefore
 | 
					
						
							|  |  |  |      *                     whether to expect a closing brace.
 | 
					
						
							|  |  |  |      * @return the resulting chain, or null on error.
 | 
					
						
							|  |  |  |      */
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |     private PatternChain<T> parseSegment(boolean isSubsegment){
 | 
					
						
							|  |  |  |         if(index >= source.length() || ((source.charAt(index) != '(') && isSubsegment)) return null;
 | 
					
						
							|  |  |  |         if(isSubsegment) index++;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Stack<PatternChain<T>> orChain = new Stack<>();
 | 
					
						
							|  |  |  |         PatternChain<T> fullChain = new PatternChain<>();
 | 
					
						
							|  |  |  |         PatternChain<T> currentChain = null;
 | 
					
						
							|  |  |  |         while (index < source.length() && source.charAt(index) != ')'){
 | 
					
						
							|  |  |  |             char currentChar = source.charAt(index);
 | 
					
						
							|  |  |  |             if(operations.containsKey(currentChar)){
 | 
					
						
							|  |  |  |                 if(currentChain == null) return null;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 currentChain = operations.get(currentChar).apply(currentChain);
 | 
					
						
							|  |  |  |                 fullChain.append(currentChain);
 | 
					
						
							|  |  |  |                 currentChain = null;
 | 
					
						
							|  |  |  |                 index++;
 | 
					
						
							|  |  |  |             } else if(currentChar == '|'){
 | 
					
						
							|  |  |  |                 if(currentChain == null) return null;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 fullChain.append(currentChain);
 | 
					
						
							|  |  |  |                 orChain.push(fullChain);
 | 
					
						
							|  |  |  |                 currentChain = null;
 | 
					
						
							|  |  |  |                 fullChain = new PatternChain<>();
 | 
					
						
							| 
									
										
										
										
											2017-07-25 13:53:19 -07:00
										 |  |  |                 if(++index >= source.length()) return null;
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |             } else if(currentChar == '('){
 | 
					
						
							|  |  |  |                 if(currentChain != null) {
 | 
					
						
							|  |  |  |                     fullChain.append(currentChain);
 | 
					
						
							|  |  |  |                 }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 currentChain = parseSegment(true);
 | 
					
						
							|  |  |  |                 if(currentChain == null) return null;
 | 
					
						
							|  |  |  |             } else if(currentChar == '['){
 | 
					
						
							|  |  |  |                 if(currentChain != null){
 | 
					
						
							|  |  |  |                     fullChain.append(currentChain);
 | 
					
						
							|  |  |  |                 }
 | 
					
						
							|  |  |  |                 currentChain = parseOr();
 | 
					
						
							|  |  |  |                 if(currentChain == null) return null;
 | 
					
						
							| 
									
										
										
										
											2017-07-24 20:47:13 -07:00
										 |  |  |             } else if(currentChar == '.'){
 | 
					
						
							|  |  |  |                 if(currentChain != null){
 | 
					
						
							|  |  |  |                     fullChain.append(currentChain);
 | 
					
						
							|  |  |  |                 }
 | 
					
						
							|  |  |  |                 currentChain = new PatternChain<>(new AnyNode<>());
 | 
					
						
							|  |  |  |                 index++;
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |             } else {
 | 
					
						
							|  |  |  |                 if(currentChain != null){
 | 
					
						
							|  |  |  |                     fullChain.append(currentChain);
 | 
					
						
							|  |  |  |                 }
 | 
					
						
							|  |  |  |                 currentChain = parseValue();
 | 
					
						
							|  |  |  |                 if(currentChain == null) return null;
 | 
					
						
							|  |  |  |             }
 | 
					
						
							|  |  |  |         }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if(!(!isSubsegment || (index < source.length() && source.charAt(index) == ')'))) return null;
 | 
					
						
							|  |  |  |         if(isSubsegment) index++;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if(currentChain != null) fullChain.append(currentChain);
 | 
					
						
							|  |  |  |         if(!orChain.empty()){
 | 
					
						
							|  |  |  |             orChain.push(fullChain);
 | 
					
						
							|  |  |  |             fullChain = combineChains(orChain);
 | 
					
						
							|  |  |  |         }
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return fullChain;
 | 
					
						
							|  |  |  |     }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-07-25 22:47:48 -07:00
										 |  |  |     /**
 | 
					
						
							|  |  |  |      * Creates / compiles a new pattern with the given id from the given string.
 | 
					
						
							|  |  |  |      * @param from the string to compile a pattern from.
 | 
					
						
							|  |  |  |      * @param id the ID to use.
 | 
					
						
							|  |  |  |      */
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  |     public Pattern(String from, T id){
 | 
					
						
							|  |  |  |         this.id = id;
 | 
					
						
							|  |  |  |         index = 0;
 | 
					
						
							|  |  |  |         source = from;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         PatternChain<T> chain = parseSegment(false);
 | 
					
						
							|  |  |  |         if(chain == null) {
 | 
					
						
							|  |  |  |             head = null;
 | 
					
						
							|  |  |  |         } else {
 | 
					
						
							|  |  |  |             chain.append(new EndNode<>(id));
 | 
					
						
							|  |  |  |             head = chain.head;
 | 
					
						
							|  |  |  |         }
 | 
					
						
							|  |  |  |     }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-07-25 22:47:48 -07:00
										 |  |  |     /**
 | 
					
						
							|  |  |  |      * Gets the head PatternNode, for use in matching
 | 
					
						
							|  |  |  |      * @return the pattern node.
 | 
					
						
							|  |  |  |      */
 | 
					
						
							| 
									
										
										
										
											2017-07-24 20:47:13 -07:00
										 |  |  |     public PatternNode<T> getHead() {
 | 
					
						
							|  |  |  |         return head;
 | 
					
						
							|  |  |  |     }
 | 
					
						
							| 
									
										
										
										
											2017-07-27 15:26:02 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-07-27 16:27:26 -07:00
										 |  |  |     /**
 | 
					
						
							|  |  |  |      * Removes all characters that are considered "special" from
 | 
					
						
							|  |  |  |      * the given string.
 | 
					
						
							|  |  |  |      * @param from the string to sanitize.
 | 
					
						
							|  |  |  |      * @return the resulting string.
 | 
					
						
							|  |  |  |      */
 | 
					
						
							| 
									
										
										
										
											2017-07-27 15:26:02 -07:00
										 |  |  |     public static String sanitize(String from){
 | 
					
						
							|  |  |  |         Pattern<Integer> pattern = new Pattern<>("", 0);
 | 
					
						
							|  |  |  |         from = from.replace(".", "\\.");
 | 
					
						
							|  |  |  |         from = from.replace("|", "\\|");
 | 
					
						
							|  |  |  |         from = from.replace("(", "\\(");
 | 
					
						
							|  |  |  |         from = from.replace(")", "\\)");
 | 
					
						
							|  |  |  |         for(Character key : pattern.operations.keySet()){
 | 
					
						
							|  |  |  |             from = from.replace("" + key, "\\" + key);
 | 
					
						
							|  |  |  |         }
 | 
					
						
							|  |  |  |         return from;
 | 
					
						
							|  |  |  |     }
 | 
					
						
							| 
									
										
										
										
											2017-07-24 17:42:52 -07:00
										 |  |  | }
 |