Implement construction of regular expression NFAs.

This commit is contained in:
Danila Fedorin 2017-01-07 22:15:35 -08:00
parent a2044b596a
commit 06fe67b226

View File

@ -3,6 +3,11 @@
#include <string.h> #include <string.h>
#include "ll.h" #include "ll.h"
/*
 * List-iteration callback that releases a single element.
 *
 * data: heap pointer held by the list entry; handed straight to free().
 * args: extra arguments forwarded by the iterator; not used here.
 *
 * Always returns 0.
 */
int _regex_node_foreach_free(void* data, va_list args){
    (void) args;
    free(data);
    return 0;
}
libregex_result _regex_node_create_clear(regex_node** node){ libregex_result _regex_node_create_clear(regex_node** node){
libregex_result result = LIBREGEX_SUCCESS; libregex_result result = LIBREGEX_SUCCESS;
*node = malloc(sizeof(**node)); *node = malloc(sizeof(**node));
@ -175,8 +180,301 @@ libregex_result _regex_find_all(regex_node* node, ll* append_to, int tag_with){
return result; return result;
} }
/*
 * Read one literal character from expression at position *string_index.
 *
 * A backslash at the current position is skipped so that the character
 * following it is taken verbatim. The character found is stored in
 * *read_into; when it is not the terminator, *string_index is advanced
 * past it.
 *
 * Returns LIBREGEX_SUCCESS when a character was read, or LIBREGEX_INVALID
 * (with *read_into set to '\0') when the end of the string was reached.
 * A skipped backslash stays consumed even in the failure case.
 */
libregex_result _regex_read_value(char* read_into, char* expression, int* string_index){
    char current;

    if(expression[*string_index] == '\\'){
        *string_index += 1;
    }

    current = expression[*string_index];
    *read_into = current;
    if(current == '\0'){
        return LIBREGEX_INVALID;
    }

    *string_index += 1;
    return LIBREGEX_SUCCESS;
}
/*
 * Build the chain for a bracket expression such as "[abc]" or "[a-z0-9]".
 *
 * build_into:   receives the newly created chain on success, NULL on failure.
 * expression:   the full regular expression text.
 * string_index: on entry, the position of the opening '['; on success it is
 *               left just past the closing ']'.
 *
 * Every member of the bracket becomes a value or range node whose successor
 * is one shared connect node at the tail of the chain. Members after the
 * first are combined with the existing head through a fork node, so the
 * head ends up as a tree of forks over all alternatives.
 *
 * Returns LIBREGEX_SUCCESS, LIBREGEX_INVALID when the closing ']' is missing
 * or a member cannot be read, or whatever error a node/chain constructor
 * reported. On any failure everything built so far is released.
 */
libregex_result _regex_build_or(regex_chain** build_into, char* expression, int* string_index){
    regex_node* tail_node = NULL;
    libregex_result result = _regex_chain_create(build_into, NULL, NULL);

    /* The shared tail must exist before anything is linked to it. */
    if(result == LIBREGEX_SUCCESS){
        result = _regex_node_create_connect(&tail_node, NULL);
    }
    if(result == LIBREGEX_SUCCESS){
        _regex_chain_append_node(*build_into, tail_node);
        /* Step over the opening '['. */
        (*string_index)++;
    }

    while(result == LIBREGEX_SUCCESS && expression[*string_index] && expression[*string_index] != ']'){
        regex_node* new_node = NULL;
        regex_node* fork = NULL;
        char from = '\0';
        char to = '\0';

        result = _regex_read_value(&from, expression, string_index);
        if(result == LIBREGEX_SUCCESS && expression[*string_index] == '-'){
            /* "from-to" range: consume the dash and read the upper bound. */
            (*string_index)++;
            result = _regex_read_value(&to, expression, string_index);
        }

        if(result == LIBREGEX_SUCCESS){
            /* 'to' is still '\0' when no range was given. */
            result = (to == '\0') ? _regex_node_create_value(&new_node, from, tail_node) :
                _regex_node_create_range(&new_node, from, to, tail_node);
        }

        /* The first member replaces the head directly; later members are
         * joined to the existing head through a fork. */
        if(result == LIBREGEX_SUCCESS && (*build_into)->head != tail_node){
            result = _regex_node_create_fork(&fork, (*build_into)->head, new_node);
        }

        if(result == LIBREGEX_SUCCESS){
            (*build_into)->head = fork ? fork : new_node;
        } else {
            /* Neither node is reachable from the chain head yet, so they
             * must be released here; each is freed exactly once. */
            free(fork);
            free(new_node);
        }
    }

    if(result == LIBREGEX_SUCCESS){
        if(expression[*string_index] == ']'){
            (*string_index)++;
        } else {
            /* Ran off the end of the string without a closing bracket. */
            result = LIBREGEX_INVALID;
        }
    }

    if(result != LIBREGEX_SUCCESS){
        /* The chain itself may not exist if its creation was what failed. */
        if(*build_into){
            if((*build_into)->head){
                regex_free((*build_into)->head);
            }
            free(*build_into);
        }
        *build_into = NULL;
    }

    return result;
}
/*
 * Tell whether c is one of the postfix repetition operators
 * '*', '+' or '?'. Returns 1 if so, 0 otherwise.
 */
int _regex_is_char_op(char c){
    switch(c){
        case '*':
        case '+':
        case '?':
            return 1;
        default:
            return 0;
    }
}
/*
 * Append the chain *to_merge onto *merge_into, then release the
 * *to_merge chain structure and reset the caller's pointer to NULL.
 *
 * NOTE(review): callers in this file invoke this with *to_merge == NULL;
 * that relies on _regex_chain_append_chain accepting a NULL chain - confirm.
 */
void _regex_chain_merge(regex_chain** merge_into, regex_chain** to_merge){
    regex_chain* destination = *merge_into;
    regex_chain* donor = *to_merge;

    _regex_chain_append_chain(destination, donor);
    free(donor);
    *to_merge = NULL;
}
/*
 * Parse a (sub)expression into a chain of NFA nodes.
 *
 * build_into:   receives the finished chain on success, NULL on failure.
 * expression:   the full regular expression text.
 * string_index: current position. For a group it points at the opening '('
 *               on entry and is left just past the matching ')' on success.
 *               The top-level call passes an index one before the first
 *               character (regex_build passes -1).
 * groups:       running group counter; incremented for every '(' entered.
 *
 * The parser keeps two pieces of state:
 *   - sub_chain:     the most recently parsed atom (character, '.', bracket
 *                    expression or group). It is held back so a following
 *                    '*', '+' or '?' can wrap it before it is merged.
 *   - current_chain: everything parsed so far in the current alternative.
 * Each '|' pushes current_chain onto chain_stack and starts a new one; after
 * the loop all alternatives are joined through fork nodes onto one shared
 * connect node.
 *
 * Returns LIBREGEX_SUCCESS; LIBREGEX_INVALID for malformed input (operator
 * with nothing to apply to, unbalanced parentheses, empty alternative);
 * LIBREGEX_MALLOC if the alternative stack cannot grow; or the error of a
 * failing node/chain constructor. On failure everything built is released.
 */
libregex_result _regex_build_chain(regex_chain** build_into, char* expression, int* string_index, int* groups){
    regex_node* group_open = NULL;
    regex_node* group_close = NULL;
    regex_chain* current_chain = NULL;
    regex_chain* sub_chain = NULL;
    ll chain_stack;
    /* The index check keeps the top-level call (index -1) from reading
     * before the start of the string. */
    int is_subchain = *string_index >= 0 && expression[*string_index] == '(';
    libregex_result result = LIBREGEX_SUCCESS;

    if(is_subchain){
        result = _regex_node_create_group(&group_open, &group_close, ++(*groups));
    }
    if(result == LIBREGEX_SUCCESS){
        result = _regex_chain_create(&current_chain, NULL, NULL);
    }
    if(result == LIBREGEX_SUCCESS){
        /* Step past '(' for a group, or onto character 0 at top level. */
        (*string_index)++;
    }
    ll_init(&chain_stack);

    /* result is tested first: if setup failed at top level the index is
     * still -1 and must not be used to read the expression. */
    while(result == LIBREGEX_SUCCESS && expression[*string_index] && expression[*string_index] != ')'){
        if(_regex_is_char_op(expression[*string_index])){
            if(sub_chain){
                regex_node* connect = NULL;
                regex_node* fork = NULL;
                result = _regex_node_create_connect(&connect, NULL);
                if(result == LIBREGEX_SUCCESS){
                    result = _regex_node_create_fork(&fork, sub_chain->head, connect);
                }
                if(result == LIBREGEX_SUCCESS){
                    if(expression[*string_index] == '+' || expression[*string_index] == '*'){
                        /* Loop back: the atom's tail leads to the fork, which
                         * either re-enters the atom or exits via connect. */
                        _regex_chain_append_node(sub_chain, fork);
                        if(expression[*string_index] == '*'){
                            /* Zero repetitions allowed: enter through the fork. */
                            sub_chain->head = fork;
                        }
                    } else {
                        /* '?': the fork chooses between the atom and skipping it. */
                        _regex_chain_append_node(sub_chain, connect);
                        sub_chain->head = fork;
                    }
                    sub_chain->tail = connect;
                    _regex_chain_merge(&current_chain, &sub_chain);
                    (*string_index)++;
                } else {
                    free(connect);
                    free(fork);
                }
            } else {
                /* Operator with no preceding atom. */
                result = LIBREGEX_INVALID;
            }
        } else if(expression[*string_index] == '('){
            _regex_chain_merge(&current_chain, &sub_chain);
            result = _regex_build_chain(&sub_chain, expression, string_index, groups);
        } else if(expression[*string_index] == '['){
            _regex_chain_merge(&current_chain, &sub_chain);
            result = _regex_build_or(&sub_chain, expression, string_index);
        } else if(expression[*string_index] == '|'){
            _regex_chain_merge(&current_chain, &sub_chain);
            result = ll_append(&chain_stack, current_chain) == LIBDS_SUCCESS ? LIBREGEX_SUCCESS : LIBREGEX_MALLOC;
            if(result == LIBREGEX_SUCCESS){
                /* The stack now owns the finished alternative; drop our
                 * reference so a failed create cannot lead to a double free. */
                current_chain = NULL;
                result = _regex_chain_create(&current_chain, NULL, NULL);
            }
            (*string_index)++;
        } else {
            regex_node* new_node = NULL;
            _regex_chain_merge(&current_chain, &sub_chain);
            if(expression[*string_index] == '.'){
                result = _regex_node_create_any(&new_node, NULL);
                if(result == LIBREGEX_SUCCESS){
                    (*string_index)++;
                }
            } else {
                char into = '\0';
                result = _regex_read_value(&into, expression, string_index);
                if(result == LIBREGEX_SUCCESS){
                    result = _regex_node_create_value(&new_node, into, NULL);
                }
            }
            if(result == LIBREGEX_SUCCESS){
                result = _regex_chain_create(&sub_chain, new_node, new_node);
            }
            if(result != LIBREGEX_SUCCESS){
                free(new_node);
                free(sub_chain);
                sub_chain = NULL;
            }
        }
    }

    if(result == LIBREGEX_SUCCESS){
        /* Flush the last pending atom. */
        _regex_chain_merge(&current_chain, &sub_chain);
    } else if(sub_chain){
        regex_free(sub_chain->head);
        free(sub_chain);
        sub_chain = NULL;
    }

    if(result == LIBREGEX_SUCCESS){
        /* A group must stop on its ')'; the top level must stop on the end
         * of the string. Stopping on ')' at top level means an unmatched
         * closing parenthesis, which would otherwise silently truncate the
         * expression. */
        char stopped_on = expression[*string_index];
        if(is_subchain ? stopped_on != ')' : stopped_on != '\0'){
            result = LIBREGEX_INVALID;
        }
    }

    if(result == LIBREGEX_SUCCESS && chain_stack.head){
        if(current_chain->tail){
            regex_node* end_node = NULL;
            result = _regex_node_create_connect(&end_node, NULL);
            if(result == LIBREGEX_SUCCESS){
                _regex_chain_append_node(current_chain, end_node);
            }
            while(chain_stack.tail && result == LIBREGEX_SUCCESS){
                regex_node* fork = NULL;
                regex_chain* new_chain = ll_poptail(&chain_stack);
                if(new_chain == NULL){
                    /* Only non-NULL chains are pushed; nothing to join.
                     * NOTE(review): assumes ll_poptail removed the entry. */
                    continue;
                }
                if(new_chain->tail == NULL){
                    /* Empty alternative, e.g. "|a" or "a||b"; rejected the
                     * same way as an empty final alternative. */
                    result = LIBREGEX_INVALID;
                } else {
                    result = _regex_node_create_fork(&fork, current_chain->head, new_chain->head);
                }
                if(result == LIBREGEX_SUCCESS){
                    _regex_chain_append_node(new_chain, end_node);
                    current_chain->head = fork;
                } else if(new_chain->head){
                    regex_free(new_chain->head);
                }
                free(new_chain);
            }
        } else {
            /* Empty final alternative, e.g. "a|". */
            result = LIBREGEX_INVALID;
        }
    }

    if(result == LIBREGEX_SUCCESS){
        if(is_subchain){
            /* Consume the closing ')'. */
            (*string_index)++;
        }
        /* Both group nodes are NULL at top level. */
        _regex_chain_append_node(current_chain, group_close);
        _regex_chain_prepend_node(current_chain, group_open);
        *build_into = current_chain;
    } else {
        while(chain_stack.tail){
            regex_chain* new_chain = ll_poptail(&chain_stack);
            if(new_chain){
                regex_free(new_chain->head);
                free(new_chain);
            }
        }
        if(current_chain){
            regex_free(current_chain->head);
            free(current_chain);
        }
        free(group_close);
        free(group_open);
        *build_into = NULL;
    }

    return result;
}
/*
 * Reset a node to the cleared state: its type becomes REGEX_CLEAR, its
 * list_id becomes -1 and its data_u member is zero-filled.
 *
 * node: the node to reset; must not be NULL.
 */
void regex_node_clear(regex_node* node){
    node->type = REGEX_CLEAR;
    node->list_id = -1;
    memset(&node->data_u, 0, sizeof(node->data_u));
}
/*
 * Release every node collected by _regex_find_all starting from root.
 *
 * The nodes are first gathered into a temporary list (tagged with -2),
 * each collected pointer is then passed to free(), and finally the
 * temporary list itself is released.
 *
 * Returns the result of the collection step; whatever was collected is
 * freed regardless of that result.
 */
libregex_result regex_free(regex_node* root){
    ll collected;
    libregex_result status;

    ll_init(&collected);
    status = _regex_find_all(root, &collected, -2);

    ll_foreach(&collected, NULL, compare_always, _regex_node_foreach_free);
    ll_free(&collected);

    return status;
}
/*
 * Compile a regular expression string into a node graph.
 *
 * root:       receives the head node of the built graph on success,
 *             NULL on failure.
 * expression: the NUL-terminated pattern text.
 *
 * A node of type REGEX_END is created up front and appended after the
 * parsed chain. The temporary chain structure is released before
 * returning; only the nodes are handed to the caller.
 *
 * Returns LIBREGEX_SUCCESS or the error reported by node creation or
 * by _regex_build_chain.
 */
libregex_result regex_build(regex_node** root, char* expression){
    regex_node* terminal = NULL;
    regex_chain* chain = NULL;
    /* Parsing starts one before the first character. */
    int cursor = -1;
    int group_count = 0;
    libregex_result status = _regex_node_create_clear(&terminal);

    if(status == LIBREGEX_SUCCESS){
        terminal->type = REGEX_END;
        status = _regex_build_chain(&chain, expression, &cursor, &group_count);
    }

    if(status != LIBREGEX_SUCCESS){
        free(terminal);
        if(chain != NULL){
            regex_free(chain->head);
            free(chain);
        }
        *root = NULL;
        return status;
    }

    _regex_chain_append_node(chain, terminal);
    *root = chain->head;
    free(chain);
    return status;
}
libregex_result regex_match_string(regex_node* root, char* string, regex_result* result);