From 06fe67b22622c59dc6eb561fe4145842ff83a202 Mon Sep 17 00:00:00 2001
From: Danila Fedorin
Date: Sat, 7 Jan 2017 22:15:35 -0800
Subject: [PATCH] Implement construction of regular expression NFAs.

---
Review notes (text in this area is ignored by git-am):
 - The source of this patch had been flattened; line breaks, indentation and
   the positions of blank context lines are reconstructed and should be
   checked against the tree before applying.
 - The first context line of the first hunk lost its header name ("#include"
   with no operand); restore it from the target file.
 - "&current_chain" had been mis-encoded as a currency sign plus "t_chain".
 - Code fixes are marked with "Fix:" comments. The index line still carries
   the original blob ids.

 src/libregex.c | 371 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 371 insertions(+)

diff --git a/src/libregex.c b/src/libregex.c
index 68eda3c..f485e4b 100644
--- a/src/libregex.c
+++ b/src/libregex.c
@@ -3,6 +3,12 @@
 #include
 #include "ll.h"
 
+/* ll_foreach callback: frees the element's data pointer; args is unused. */
+int _regex_node_foreach_free(void* data, va_list args){
+    free(data);
+    return 0;
+}
+
 libregex_result _regex_node_create_clear(regex_node** node){
     libregex_result result = LIBREGEX_SUCCESS;
     *node = malloc(sizeof(**node));
@@ -175,8 +181,373 @@ libregex_result _regex_find_all(regex_node* node, ll* append_to, int tag_with){
     return result;
 }
 
+/*
+ * Reads one character of expression at *string_index into *read_into and
+ * advances the index past it. A leading backslash is skipped first, so the
+ * character after it is taken verbatim.
+ * Returns LIBREGEX_INVALID and stores '\0' when the expression ends there.
+ */
+libregex_result _regex_read_value(char* read_into, char* expression, int* string_index){
+    libregex_result result = LIBREGEX_SUCCESS;
+    if(expression[*string_index] == '\\'){
+        (*string_index)++;
+    }
+    if(expression[*string_index]){
+        *read_into = expression[*string_index];
+        (*string_index)++;
+    } else {
+        *read_into = '\0';
+        result = LIBREGEX_INVALID;
+    }
+    return result;
+}
+
+/*
+ * Parses a bracket expression whose '[' is at *string_index. Each listed
+ * value or from-to range becomes a node leading to one shared connect node,
+ * and the alternatives are joined with fork nodes.
+ * On success *string_index is left just past the closing ']'. On failure the
+ * partial chain is released and *build_into is set to NULL.
+ */
+libregex_result _regex_build_or(regex_chain** build_into, char* expression, int* string_index){
+    regex_node* tail_node = NULL;
+    libregex_result result = _regex_chain_create(build_into, NULL, NULL);
+    if(result == LIBREGEX_SUCCESS){
+        /* Fix: the result of creating the connect node used to be ignored. */
+        result = _regex_node_create_connect(&tail_node, NULL);
+    }
+    if(result == LIBREGEX_SUCCESS){
+        _regex_chain_append_node(*build_into, tail_node);
+        (*string_index)++;
+    }
+
+    while(result == LIBREGEX_SUCCESS && expression[*string_index] && expression[*string_index] != ']'){
+        regex_node* new_node = NULL;
+        regex_node* fork = NULL;
+        regex_node* new_head = NULL;
+        char from = '\0';
+        char to = '\0';
+
+        result = _regex_read_value(&from, expression, string_index);
+        if(result == LIBREGEX_SUCCESS){
+            if(expression[*string_index] == '-'){
+                (*string_index)++;
+                result = _regex_read_value(&to, expression, string_index);
+            }
+        }
+
+        if(result == LIBREGEX_SUCCESS){
+            /* to is still '\0' unless a '-' introduced a range. */
+            result = (to == '\0') ? _regex_node_create_value(&new_node, from, tail_node) :
+                _regex_node_create_range(&new_node, from, to, tail_node);
+            new_head = new_node;
+        }
+
+        /* Once the head is no longer the shared tail, fork between the old head and the new node. */
+        if(result == LIBREGEX_SUCCESS && (*build_into)->head != tail_node){
+            result = _regex_node_create_fork(&fork, (*build_into)->head, new_node);
+            new_head = fork;
+        }
+
+        if(result == LIBREGEX_SUCCESS){
+            (*build_into)->head = new_head;
+        } else {
+            /* Fix: free new_node itself; new_head may alias fork by now, which leaked new_node. */
+            free(fork);
+            free(new_node);
+        }
+    }
+
+    if(result == LIBREGEX_SUCCESS){
+        if(expression[*string_index] == ']'){
+            (*string_index)++;
+        } else {
+            result = LIBREGEX_INVALID;
+        }
+    }
+
+    if(result != LIBREGEX_SUCCESS){
+        /* Fix: *build_into may still be NULL if the chain itself was never created. */
+        if(*build_into){
+            if((*build_into)->head){
+                regex_free((*build_into)->head);
+            }
+            free(*build_into);
+        }
+        *build_into = NULL;
+    }
+
+    return result;
+}
+
+/* Returns non-zero when c is one of the postfix operators '*', '+' or '?'. */
+int _regex_is_char_op(char c){
+    return c == '*' || c == '+' || c == '?';
+}
+
+/*
+ * Appends *to_merge to *merge_into via _regex_chain_append_chain, then frees
+ * the *to_merge chain struct and resets the caller's pointer to NULL.
+ */
+void _regex_chain_merge(regex_chain** merge_into, regex_chain** to_merge){
+    _regex_chain_append_chain(*merge_into, *to_merge);
+    free(*to_merge);
+    *to_merge = NULL;
+}
+
+/*
+ * Builds a chain for expression, reading from the character after
+ * *string_index until the end of the string or a ')'.
+ * If *string_index is on a '(' the chain is wrapped in a pair of group nodes
+ * (numbered by incrementing *groups) and the closing ')' is required and
+ * consumed. Chains that precede a '|' are parked on a stack and forked
+ * together with the last alternative once parsing stops.
+ * On failure every partial structure is released and *build_into is NULL.
+ */
+libregex_result _regex_build_chain(regex_chain** build_into, char* expression, int* string_index, int* groups){
+    regex_node* group_open = NULL;
+    regex_node* group_close = NULL;
+    regex_chain* current_chain = NULL;
+    regex_chain* sub_chain = NULL;
+    ll chain_stack;
+    int is_subchain = *string_index >= 0 && expression[*string_index] == '(';
+    libregex_result result = LIBREGEX_SUCCESS;
+
+    if(is_subchain){
+        result = _regex_node_create_group(&group_open, &group_close, ++(*groups));
+    }
+
+    if(result == LIBREGEX_SUCCESS){
+        result = _regex_chain_create(&current_chain, NULL, NULL);
+    }
+
+    if(result == LIBREGEX_SUCCESS){
+        (*string_index)++;
+    }
+
+    ll_init(&chain_stack);
+
+    /* Fix: test result first; after a failed setup *string_index can still be -1. */
+    while(result == LIBREGEX_SUCCESS && expression[*string_index] && expression[*string_index] != ')'){
+        if(_regex_is_char_op(expression[*string_index])){
+            if(sub_chain){
+                regex_node* connect = NULL;
+                regex_node* fork = NULL;
+
+                result = _regex_node_create_connect(&connect, NULL);
+                if(result == LIBREGEX_SUCCESS){
+                    result = _regex_node_create_fork(&fork, sub_chain->head, connect);
+                }
+
+                if(result == LIBREGEX_SUCCESS){
+                    if(expression[*string_index] == '+' || expression[*string_index] == '*'){
+                        _regex_chain_append_node(sub_chain, fork);
+                        if(expression[*string_index] == '*'){
+                            sub_chain->head = fork;
+                        }
+                    } else {
+                        _regex_chain_append_node(sub_chain, connect);
+                        sub_chain->head = fork;
+                    }
+                    sub_chain->tail = connect;
+
+                    _regex_chain_merge(&current_chain, &sub_chain);
+                    (*string_index)++;
+                } else {
+                    free(connect);
+                    free(fork);
+                }
+            } else {
+                result = LIBREGEX_INVALID;
+            }
+        } else if(expression[*string_index] == '('){
+            _regex_chain_merge(&current_chain, &sub_chain);
+
+            result = _regex_build_chain(&sub_chain, expression, string_index, groups);
+        } else if(expression[*string_index] == '['){
+            _regex_chain_merge(&current_chain, &sub_chain);
+
+            result = _regex_build_or(&sub_chain, expression, string_index);
+        } else if(expression[*string_index] == '|'){
+            _regex_chain_merge(&current_chain, &sub_chain);
+
+            result = ll_append(&chain_stack, current_chain) == LIBDS_SUCCESS ? LIBREGEX_SUCCESS : LIBREGEX_MALLOC;
+            if(result == LIBREGEX_SUCCESS){
+                /* Fix: the stack holds the chain now; drop our pointer so cleanup cannot free it twice. */
+                current_chain = NULL;
+                result = _regex_chain_create(&current_chain, NULL, NULL);
+            }
+
+            (*string_index)++;
+        } else {
+            regex_node* new_node = NULL;
+
+            _regex_chain_merge(&current_chain, &sub_chain);
+
+            if(expression[*string_index] == '.'){
+                result = _regex_node_create_any(&new_node, NULL);
+                if(result == LIBREGEX_SUCCESS){
+                    (*string_index)++;
+                }
+            } else {
+                char into = '\0';
+                result = _regex_read_value(&into, expression, string_index);
+                if(result == LIBREGEX_SUCCESS){
+                    result = _regex_node_create_value(&new_node, into, NULL);
+                }
+            }
+
+            if(result == LIBREGEX_SUCCESS){
+                result = _regex_chain_create(&sub_chain, new_node, new_node);
+            }
+
+            if(result != LIBREGEX_SUCCESS){
+                free(new_node);
+                free(sub_chain);
+                sub_chain = NULL;
+            }
+        }
+    }
+
+    if(result == LIBREGEX_SUCCESS){
+        _regex_chain_merge(&current_chain, &sub_chain);
+    } else {
+        if(sub_chain){
+            regex_free(sub_chain->head);
+            free(sub_chain);
+            sub_chain = NULL;
+        }
+    }
+
+    if(result == LIBREGEX_SUCCESS && is_subchain && expression[*string_index] != ')'){
+        result = LIBREGEX_INVALID;
+    }
+
+    /* Fix: a ')' that stops a top-level parse has no opening '(' to match. */
+    if(result == LIBREGEX_SUCCESS && !is_subchain && expression[*string_index] == ')'){
+        result = LIBREGEX_INVALID;
+    }
+
+    if(result == LIBREGEX_SUCCESS && chain_stack.head){
+        if(current_chain->tail){
+            regex_node* end_node = NULL;
+            result = _regex_node_create_connect(&end_node, NULL);
+            if(result == LIBREGEX_SUCCESS){
+                _regex_chain_append_node(current_chain, end_node);
+            }
+
+            while(chain_stack.tail && result == LIBREGEX_SUCCESS){
+                regex_node* fork = NULL;
+                regex_chain* new_chain = ll_poptail(&chain_stack);
+
+                /* Fix: check the popped chain before reading its head, not after. */
+                if(new_chain == NULL){
+                    continue;
+                }
+
+                result = _regex_node_create_fork(&fork, current_chain->head, new_chain->head);
+                if(result == LIBREGEX_SUCCESS){
+                    _regex_chain_append_node(new_chain, end_node);
+                    current_chain->head = fork;
+                } else {
+                    regex_free(new_chain->head);
+                }
+
+                free(new_chain);
+            }
+        } else {
+            result = LIBREGEX_INVALID;
+        }
+    }
+
+    if(result == LIBREGEX_SUCCESS){
+        if(is_subchain){
+            (*string_index)++;
+        }
+        _regex_chain_append_node(current_chain, group_close);
+        _regex_chain_prepend_node(current_chain, group_open);
+        *build_into = current_chain;
+    } else {
+        while(chain_stack.tail){
+            regex_chain* new_chain = ll_poptail(&chain_stack);
+            if(new_chain){
+                regex_free(new_chain->head);
+                free(new_chain);
+            }
+        }
+
+        if(current_chain){
+            regex_free(current_chain->head);
+            free(current_chain);
+        }
+        free(group_close);
+        free(group_open);
+        *build_into = NULL;
+    }
+
+    return result;
+}
+
 void regex_node_clear(regex_node* node){
     node->type = REGEX_CLEAR;
     node->list_id = -1;
     memset(&node->data_u, 0, sizeof(node->data_u));
 }
+
+/*
+ * Frees every node that _regex_find_all collects when started from root.
+ * Returns the result of that collection step.
+ */
+libregex_result regex_free(regex_node* root){
+    libregex_result result;
+    ll found_list;
+    ll_init(&found_list);
+    result = _regex_find_all(root, &found_list, -2);
+    ll_foreach(&found_list, NULL, compare_always, _regex_node_foreach_free);
+    ll_free(&found_list);
+    return result;
+}
+
+/*
+ * Compiles expression into a node graph that ends in a REGEX_END node and
+ * stores the entry node in *root.
+ * On any failure the partial graph is released and *root is set to NULL.
+ */
+libregex_result regex_build(regex_node** root, char* expression){
+    libregex_result result;
+    regex_chain* building_chain = NULL;
+    int index = -1;
+    int number_groups = 0;
+    regex_node* end_node = NULL;
+
+    /* Fix: reject missing arguments instead of dereferencing them. */
+    if(root == NULL){
+        return LIBREGEX_INVALID;
+    }
+    if(expression == NULL){
+        *root = NULL;
+        return LIBREGEX_INVALID;
+    }
+
+    result = _regex_node_create_clear(&end_node);
+    if(result == LIBREGEX_SUCCESS){
+        end_node->type = REGEX_END;
+        result = _regex_build_chain(&building_chain, expression, &index, &number_groups);
+    }
+    if(result == LIBREGEX_SUCCESS){
+        _regex_chain_append_node(building_chain, end_node);
+        *root = building_chain->head;
+        free(building_chain);
+    } else {
+        free(end_node);
+        if(building_chain){
+            regex_free(building_chain->head);
+            free(building_chain);
+        }
+        *root = NULL;
+    }
+
+    return result;
+}
+
+libregex_result regex_match_string(regex_node* root, char* string, regex_result* result);
\ No newline at end of file