/*
 * NFA-based regular expression builder and matcher.
 *
 * A pattern is compiled into a graph of regex_node objects (value, range,
 * any, connect, fork, group and end nodes).  Matching advances a set of
 * "current" nodes over the input string one character at a time.
 */
#include "libregex.h"

#include <stdarg.h>
#include <stdlib.h>
#include <string.h>

#include "ll.h"

/*
 * ll_foreach callback: releases the node stored in a list entry.
 * The variadic argument list is unused.  Always returns 0.
 */
int _regex_node_foreach_free(void* data, va_list args){
    (void)args;
    free(data);
    return 0;
}

/*
 * Allocates a node and resets it with regex_node_clear().
 * On allocation failure *node is NULL and LIBREGEX_MALLOC is returned.
 */
libregex_result _regex_node_create_clear(regex_node** node){
    libregex_result result = LIBREGEX_SUCCESS;

    *node = malloc(sizeof(**node));
    if(*node){
        regex_node_clear(*node);
    } else {
        result = LIBREGEX_MALLOC;
    }
    return result;
}

/* Creates a REGEX_VALUE node matching exactly `value`, followed by `next`. */
libregex_result _regex_node_create_value(regex_node** node, char value, regex_node* next){
    libregex_result result = _regex_node_create_clear(node);

    if(result == LIBREGEX_SUCCESS){
        (*node)->type = REGEX_VALUE;
        (*node)->data_u.value_s.value = value;
        (*node)->data_u.value_s.next = next;
    }
    return result;
}

/* Creates a REGEX_RANGE node matching `from`..`to` inclusive, followed by `next`. */
libregex_result _regex_node_create_range(regex_node** node, char from, char to, regex_node* next){
    libregex_result result = _regex_node_create_clear(node);

    if(result == LIBREGEX_SUCCESS){
        (*node)->type = REGEX_RANGE;
        (*node)->data_u.range_s.from = from;
        (*node)->data_u.range_s.to = to;
        (*node)->data_u.range_s.next = next;
    }
    return result;
}

/* Creates a REGEX_ANY node (matches any character except NUL), followed by `next`. */
libregex_result _regex_node_create_any(regex_node** node, regex_node* next){
    libregex_result result = _regex_node_create_clear(node);

    if(result == LIBREGEX_SUCCESS){
        (*node)->type = REGEX_ANY;
        (*node)->data_u.any_s.next = next;
    }
    return result;
}

/* Creates a REGEX_CONNECT node: consumes no input and simply leads to `next`. */
libregex_result _regex_node_create_connect(regex_node** node, regex_node* next){
    libregex_result result = _regex_node_create_clear(node);

    if(result == LIBREGEX_SUCCESS){
        (*node)->type = REGEX_CONNECT;
        (*node)->data_u.connect_s.next = next;
    }
    return result;
}

/* Creates a REGEX_FORK node: consumes no input and leads to both `left` and `right`. */
libregex_result _regex_node_create_fork(regex_node** node, regex_node* left, regex_node* right){
    libregex_result result = _regex_node_create_clear(node);

    if(result == LIBREGEX_SUCCESS){
        (*node)->type = REGEX_FORK;
        (*node)->data_u.fork_s.left = left;
        (*node)->data_u.fork_s.right = right;
    }
    return result;
}

/*
 * Creates the pair of REGEX_GROUP marker nodes for capture group `group_id`.
 * The two nodes reference each other through `other`; only the opening node
 * has `open` set to 1 (the closing node keeps the cleared value 0).
 * On failure both outputs are NULL and nothing stays allocated.
 */
libregex_result _regex_node_create_group(regex_node** open, regex_node** close, int group_id){
    libregex_result result;

    /* *close must be well defined even when the first allocation fails,
     * because the failure path below frees it. */
    *close = NULL;

    result = _regex_node_create_clear(open);
    if(result == LIBREGEX_SUCCESS){
        result = _regex_node_create_clear(close);
    }
    if(result == LIBREGEX_SUCCESS){
        (*open)->type = (*close)->type = REGEX_GROUP;
        (*open)->data_u.group_s.id = (*close)->data_u.group_s.id = group_id;
        (*open)->data_u.group_s.other = (*close);
        (*close)->data_u.group_s.other = (*open);
        (*open)->data_u.group_s.open = 1;
    } else {
        free(*open);
        free(*close);
        *open = *close = NULL;
    }
    return result;
}

/*
 * Allocates a chain descriptor with the given head and tail.
 * On allocation failure *new_chain is NULL and LIBREGEX_MALLOC is returned.
 */
libregex_result _regex_chain_create(regex_chain** new_chain, regex_node* head, regex_node* tail){
    libregex_result result = LIBREGEX_MALLOC;

    *new_chain = malloc(sizeof(**new_chain));
    if(*new_chain){
        result = LIBREGEX_SUCCESS;
        (*new_chain)->head = head;
        (*new_chain)->tail = tail;
    }
    return result;
}

/*
 * Returns the address of the single `next` pointer of `node`, or NULL for
 * node types that have none here (fork, end, clear).
 */
regex_node** _regex_node_get_next(regex_node* node){
    regex_node** to_return = NULL;

    if(node->type == REGEX_CONNECT){
        to_return = &node->data_u.connect_s.next;
    } else if(node->type == REGEX_VALUE){
        to_return = &node->data_u.value_s.next;
    } else if(node->type == REGEX_RANGE){
        to_return = &node->data_u.range_s.next;
    } else if(node->type == REGEX_ANY){
        to_return = &node->data_u.any_s.next;
    } else if(node->type == REGEX_GROUP){
        to_return = &node->data_u.group_s.next;
    }
    return to_return;
}

/* Sets the `next` pointer of `append_to`; no-op for nodes without one. */
void _regex_node_append_node(regex_node* append_to, regex_node* to_append){
    regex_node** next = _regex_node_get_next(append_to);

    if(next){
        *next = to_append;
    }
}

/*
 * Links `to_append` after the chain's tail and makes it the new tail.
 * For an empty chain the node becomes both head and tail.
 * NULL arguments are ignored.
 */
void _regex_chain_append_node(regex_chain* append_to, regex_node* to_append){
    if(append_to && to_append){
        regex_node** to_set = append_to->head
            ? _regex_node_get_next(append_to->tail)
            : &append_to->head;

        if(to_set){
            *to_set = append_to->tail = to_append;
        }
    }
}

/*
 * Links `to_prepend` in front of the chain's head and makes it the new head
 * (and the tail, if the chain was empty).  NULL arguments are ignored, as are
 * nodes without a single `next` pointer.
 */
void _regex_chain_prepend_node(regex_chain* prepend_to, regex_node* to_prepend){
    if(prepend_to && to_prepend){
        regex_node** next = _regex_node_get_next(to_prepend);

        if(next){
            *next = prepend_to->head;
            prepend_to->head = to_prepend;
            if(prepend_to->tail == NULL){
                prepend_to->tail = to_prepend;
            }
        }
    }
}

/*
 * Links the nodes of `to_append` after `append_to` and adopts its tail.
 * The descriptor `to_append` itself is left untouched.
 */
void _regex_chain_append_chain(regex_chain* append_to, regex_chain* to_append){
    if(append_to && to_append){
        _regex_chain_append_node(append_to, to_append->head);
        if(to_append->tail){
            append_to->tail = to_append->tail;
        }
    }
}

/*
 * Recursively visits every node reachable from `node`, stamps its list_id
 * with `tag_with` and appends it to `append_to` after its successors.
 * Nodes already carrying `tag_with` are skipped, which terminates cycles.
 * Returns LIBREGEX_MALLOC if an append fails.
 */
libregex_result _regex_find_all(regex_node* node, ll* append_to, int tag_with){
    libregex_result result = LIBREGEX_SUCCESS;

    if(node && node->list_id != tag_with){
        node->list_id = tag_with;
        if(node->type == REGEX_VALUE){
            result = _regex_find_all(node->data_u.value_s.next, append_to, tag_with);
        } else if(node->type == REGEX_RANGE){
            result = _regex_find_all(node->data_u.range_s.next, append_to, tag_with);
        } else if(node->type == REGEX_ANY){
            result = _regex_find_all(node->data_u.any_s.next, append_to, tag_with);
        } else if(node->type == REGEX_GROUP){
            result = _regex_find_all(node->data_u.group_s.next, append_to, tag_with);
        } else if(node->type == REGEX_CONNECT){
            result = _regex_find_all(node->data_u.connect_s.next, append_to, tag_with);
        } else if(node->type == REGEX_FORK){
            result = _regex_find_all(node->data_u.fork_s.left, append_to, tag_with);
            if(result == LIBREGEX_SUCCESS){
                result = _regex_find_all(node->data_u.fork_s.right, append_to, tag_with);
            }
        }
        if(result == LIBREGEX_SUCCESS){
            result = (ll_append(append_to, node) == LIBDS_SUCCESS)
                ? LIBREGEX_SUCCESS
                : LIBREGEX_MALLOC;
        }
    }
    return result;
}

/*
 * Reads one literal character from `expression` at *string_index, skipping a
 * leading backslash so that the following character is taken verbatim.
 * Advances *string_index past what was read.  Hitting the end of the
 * expression yields LIBREGEX_INVALID and stores '\0' in *read_into.
 */
libregex_result _regex_read_value(char* read_into, char* expression, int* string_index){
    libregex_result result = LIBREGEX_SUCCESS;

    if(expression[*string_index] == '\\'){
        (*string_index)++;
    }
    if(expression[*string_index]){
        *read_into = expression[*string_index];
        (*string_index)++;
    } else {
        *read_into = '\0';
        result = LIBREGEX_INVALID;
    }
    return result;
}

/*
 * Builds the chain for a bracket expression.  *string_index must point at the
 * opening '['; on success it ends up just past the closing ']'.
 *
 * Every member (single character or `a-b` range) leads to one shared connect
 * node, which is the chain's tail; members are combined through fork nodes.
 * On failure everything built so far is released and *build_into is NULL.
 */
libregex_result _regex_build_or(regex_chain** build_into, char* expression, int* string_index){
    regex_node* tail_node = NULL;
    libregex_result result = _regex_chain_create(build_into, NULL, NULL);

    if(result == LIBREGEX_SUCCESS){
        result = _regex_node_create_connect(&tail_node, NULL);
    }
    if(result == LIBREGEX_SUCCESS){
        _regex_chain_append_node(*build_into, tail_node);
        (*string_index)++;
    }

    while(result == LIBREGEX_SUCCESS && expression[*string_index] && expression[*string_index] != ']'){
        regex_node* new_node = NULL;
        regex_node* fork = NULL;
        regex_node* new_head = NULL;
        char from = '\0';
        char to = '\0';

        result = _regex_read_value(&from, expression, string_index);
        if(result == LIBREGEX_SUCCESS){
            if(expression[*string_index] == '-'){
                (*string_index)++;
                result = _regex_read_value(&to, expression, string_index);
            }
        }
        if(result == LIBREGEX_SUCCESS){
            /* `to` stays '\0' unless a range was read. */
            result = (to == '\0')
                ? _regex_node_create_value(&new_node, from, tail_node)
                : _regex_node_create_range(&new_node, from, to, tail_node);
            new_head = new_node;
        }
        if(result == LIBREGEX_SUCCESS && (*build_into)->head != tail_node){
            /* Not the first member: branch between the existing members and the new one. */
            result = _regex_node_create_fork(&fork, (*build_into)->head, new_node);
            new_head = fork;
        }
        if(result == LIBREGEX_SUCCESS){
            (*build_into)->head = new_head;
        } else {
            /* Neither node is reachable from the chain yet, so release both directly. */
            free(fork);
            free(new_node);
        }
    }

    if(result == LIBREGEX_SUCCESS){
        if(expression[*string_index] == ']'){
            (*string_index)++;
        } else {
            result = LIBREGEX_INVALID;
        }
    }
    if(result != LIBREGEX_SUCCESS){
        /* The chain descriptor is NULL when its own allocation failed. */
        if(*build_into){
            regex_free((*build_into)->head);
            free(*build_into);
        }
        *build_into = NULL;
    }
    return result;
}

/* Returns non-zero when `c` is one of the postfix operators '*', '+' or '?'. */
int _regex_is_char_op(char c){
    return c == '*' || c == '+' || c == '?';
}

/*
 * Appends the nodes of *to_merge to *merge_into, then releases the
 * *to_merge descriptor and sets it to NULL.  A NULL *to_merge is a no-op.
 */
void _regex_chain_merge(regex_chain** merge_into, regex_chain** to_merge){
    _regex_chain_append_chain(*merge_into, *to_merge);
    free(*to_merge);
    *to_merge = NULL;
}

/*
 * Builds a chain for a (sub-)expression.
 *
 * At the top level *string_index is -1.  For a group it points at the opening
 * '(' and ends up just past the matching ')'.  Parsing stops at the end of the
 * expression or at a ')'.
 * `groups` counts the capture groups opened so far and is incremented for
 * each new one.
 *
 * The most recently parsed atom is kept in `sub_chain`, so that a following
 * postfix operator can still wrap it before it is merged into the current
 * alternative.  Alternatives completed by '|' are parked on `chain_stack`
 * and joined with fork nodes at the end.
 *
 * On failure everything built here is released and *build_into is NULL.
 */
libregex_result _regex_build_chain(regex_chain** build_into, char* expression, int* string_index, int* groups){
    regex_node* group_open = NULL;
    regex_node* group_close = NULL;
    regex_chain* current_chain = NULL;
    regex_chain* sub_chain = NULL;
    ll chain_stack;
    int is_subchain = *string_index >= 0 && expression[*string_index] == '(';
    libregex_result result = LIBREGEX_SUCCESS;

    if(is_subchain){
        result = _regex_node_create_group(&group_open, &group_close, ++(*groups));
    }
    if(result == LIBREGEX_SUCCESS){
        result = _regex_chain_create(&current_chain, NULL, NULL);
    }
    if(result == LIBREGEX_SUCCESS){
        (*string_index)++;
    }
    ll_init(&chain_stack);

    /* `result` is tested first: after a failed setup *string_index may still
     * be -1 and must not be used as an index. */
    while(result == LIBREGEX_SUCCESS && expression[*string_index] && expression[*string_index] != ')'){
        if(_regex_is_char_op(expression[*string_index])){
            if(sub_chain){
                regex_node* connect = NULL;
                regex_node* fork = NULL;

                result = _regex_node_create_connect(&connect, NULL);
                if(result == LIBREGEX_SUCCESS){
                    result = _regex_node_create_fork(&fork, sub_chain->head, connect);
                }
                if(result == LIBREGEX_SUCCESS){
                    if(expression[*string_index] == '+' || expression[*string_index] == '*'){
                        /* Loop: after the atom, the fork either repeats it or leaves via `connect`. */
                        _regex_chain_append_node(sub_chain, fork);
                        if(expression[*string_index] == '*'){
                            /* Zero repetitions allowed: enter through the fork. */
                            sub_chain->head = fork;
                        }
                    } else {
                        /* '?': the fork chooses between the atom and skipping to `connect`. */
                        _regex_chain_append_node(sub_chain, connect);
                        sub_chain->head = fork;
                    }
                    sub_chain->tail = connect;
                    _regex_chain_merge(&current_chain, &sub_chain);
                    (*string_index)++;
                } else {
                    free(connect);
                    free(fork);
                }
            } else {
                /* Operator without a preceding atom. */
                result = LIBREGEX_INVALID;
            }
        } else if(expression[*string_index] == '('){
            _regex_chain_merge(&current_chain, &sub_chain);
            result = _regex_build_chain(&sub_chain, expression, string_index, groups);
        } else if(expression[*string_index] == '['){
            _regex_chain_merge(&current_chain, &sub_chain);
            result = _regex_build_or(&sub_chain, expression, string_index);
        } else if(expression[*string_index] == '|'){
            _regex_chain_merge(&current_chain, &sub_chain);
            result = ll_append(&chain_stack, current_chain) == LIBDS_SUCCESS
                ? LIBREGEX_SUCCESS
                : LIBREGEX_MALLOC;
            if(result == LIBREGEX_SUCCESS){
                result = _regex_chain_create(&current_chain, NULL, NULL);
            }
            (*string_index)++;
        } else {
            regex_node* new_node = NULL;

            _regex_chain_merge(&current_chain, &sub_chain);
            if(expression[*string_index] == '.'){
                result = _regex_node_create_any(&new_node, NULL);
                if(result == LIBREGEX_SUCCESS){
                    (*string_index)++;
                }
            } else {
                char into = '\0';

                result = _regex_read_value(&into, expression, string_index);
                if(result == LIBREGEX_SUCCESS){
                    result = _regex_node_create_value(&new_node, into, NULL);
                }
            }
            if(result == LIBREGEX_SUCCESS){
                result = _regex_chain_create(&sub_chain, new_node, new_node);
            }
            if(result != LIBREGEX_SUCCESS){
                free(new_node);
                free(sub_chain);
                sub_chain = NULL;
            }
        }
    }

    if(result == LIBREGEX_SUCCESS){
        _regex_chain_merge(&current_chain, &sub_chain);
    } else if(sub_chain){
        regex_free(sub_chain->head);
        free(sub_chain);
        sub_chain = NULL;
    }

    if(result == LIBREGEX_SUCCESS && is_subchain && expression[*string_index] != ')'){
        /* Group was never closed. */
        result = LIBREGEX_INVALID;
    }

    if(result == LIBREGEX_SUCCESS && chain_stack.head){
        /* NOTE(review): an empty alternative that is not the last one (e.g.
         * "|a") has a NULL head, so its fork branch is NULL and can never
         * match, whereas an empty last alternative is rejected as invalid.
         * Confirm whether this asymmetry is intended. */
        if(current_chain->tail){
            regex_node* end_node = NULL;

            /* Every alternative is routed to one shared connect node. */
            result = _regex_node_create_connect(&end_node, NULL);
            _regex_chain_append_node(current_chain, end_node);
            while(chain_stack.tail && result == LIBREGEX_SUCCESS){
                regex_node* fork = NULL;
                regex_chain* new_chain = ll_poptail(&chain_stack);

                result = _regex_node_create_fork(&fork, current_chain->head, new_chain->head);
                if(result == LIBREGEX_SUCCESS){
                    _regex_chain_append_node(new_chain, end_node);
                    current_chain->head = fork;
                } else {
                    regex_free(new_chain->head);
                }
                free(new_chain);
            }
        } else {
            result = LIBREGEX_INVALID;
        }
    }

    if(result == LIBREGEX_SUCCESS){
        if(is_subchain){
            /* Step over the closing ')'. */
            (*string_index)++;
        }
        /* Both group markers are NULL at the top level, making these no-ops. */
        _regex_chain_append_node(current_chain, group_close);
        _regex_chain_prepend_node(current_chain, group_open);
        *build_into = current_chain;
    } else {
        while(chain_stack.tail){
            regex_chain* new_chain = ll_poptail(&chain_stack);

            if(new_chain){
                regex_free(new_chain->head);
                free(new_chain);
            }
        }
        if(current_chain){
            regex_free(current_chain->head);
            free(current_chain);
        }
        /* The group markers are only linked into the chain on success. */
        free(group_close);
        free(group_open);
        *build_into = NULL;
    }
    return result;
}

/* Resets a node to REGEX_CLEAR with list_id -1 and zeroed payload. */
void regex_node_clear(regex_node* node){
    node->type = REGEX_CLEAR;
    node->list_id = -1;
    memset(&node->data_u, 0, sizeof(node->data_u));
}

/*
 * Releases every node reachable from `root` (NULL is allowed).
 * The nodes are first collected with tag -2 so that shared and cyclic nodes
 * are freed exactly once.  Returns the result of that collection; if it
 * failed part-way, only the nodes collected so far are released.
 */
libregex_result regex_free(regex_node* root){
    libregex_result result;
    ll found_list;

    ll_init(&found_list);
    result = _regex_find_all(root, &found_list, -2);
    ll_foreach(&found_list, NULL, compare_always, _regex_node_foreach_free);
    ll_free(&found_list);
    return result;
}

/*
 * Compiles `expression` into a node graph and stores its entry node in *root.
 * The graph ends in a single REGEX_END node and must be released with
 * regex_free().
 *
 * Returns LIBREGEX_INVALID for NULL arguments or a malformed expression
 * (including a ')' without a matching '('), and LIBREGEX_MALLOC when out of
 * memory.  On any failure *root is NULL.
 */
libregex_result regex_build(regex_node** root, char* expression){
    libregex_result result;
    regex_chain* building_chain = NULL;
    int index = -1;
    int number_groups = 0;
    regex_node* end_node = NULL;

    if(root == NULL || expression == NULL){
        if(root){
            *root = NULL;
        }
        return LIBREGEX_INVALID;
    }

    result = _regex_node_create_clear(&end_node);
    if(result == LIBREGEX_SUCCESS){
        end_node->type = REGEX_END;
        result = _regex_build_chain(&building_chain, expression, &index, &number_groups);
    }
    if(result == LIBREGEX_SUCCESS && expression[index] != '\0'){
        /* The top-level parse stopped at a ')' that closes nothing; reject
         * the pattern rather than silently dropping the remainder. */
        result = LIBREGEX_INVALID;
    }

    if(result == LIBREGEX_SUCCESS){
        _regex_chain_append_node(building_chain, end_node);
        *root = building_chain->head;
        free(building_chain);
    } else {
        free(end_node);
        if(building_chain){
            regex_free(building_chain->head);
            free(building_chain);
        }
        *root = NULL;
    }
    return result;
}

/*
 * Returns non-zero when `node` consumes character `c`.
 * Only value, range and any nodes can consume input.
 */
int _regex_node_matches(regex_node* node, char c){
    int matches = 0;

    if(node->type == REGEX_VALUE){
        matches = node->data_u.value_s.value == c;
    } else if(node->type == REGEX_RANGE){
        matches = node->data_u.range_s.from <= c && c <= node->data_u.range_s.to;
    } else if(node->type == REGEX_ANY){
        matches = c != '\0';
    }
    return matches;
}

/*
 * Adds `node` to `list`, following connect, fork and group nodes through to
 * their successors.  A node is skipped when its list_id is not below the
 * list's id (already visited for this list) or when the list is full.
 * Connect and fork nodes are followed but not stored; a group node is stored
 * after its successors.
 */
void _regex_node_add(regex_node* node, regex_list* list){
    if(node && node->list_id < list->id && list->size < LIBREGEX_MAX_NODE_COUNT){
        node->list_id = list->id;
        if(node->type == REGEX_CONNECT){
            _regex_node_add(node->data_u.connect_s.next, list);
        } else if(node->type == REGEX_FORK){
            _regex_node_add(node->data_u.fork_s.left, list);
            _regex_node_add(node->data_u.fork_s.right, list);
        } else if(!(node->type == REGEX_CLEAR)){
            if(node->type == REGEX_GROUP){
                _regex_node_add(node->data_u.group_s.next, list);
            }
            list->nodes[list->size++] = node;
        }
    }
}

/*
 * Processes the character at sim->index against every node in the current
 * list:
 *   - consuming nodes that match push their successor onto the next list;
 *   - an end node marks the result as matching;
 *   - group markers record the position of the group's first (open) or last
 *     (close) character and, on close, publish the range in the result.
 * Afterwards the two lists are swapped and the new next list is emptied.
 *
 * Returns LIBREGEX_MALLOC when a result entry cannot be allocated.
 */
libregex_result _regex_step(regex_sim* sim){
    libregex_result result = LIBREGEX_SUCCESS;
    regex_list* swap_temp;
    int index = 0;

    for(; index < sim->current->size; index++){
        regex_node* current = sim->current->nodes[index];

        if(_regex_node_matches(current, sim->string[sim->index])){
            regex_node** next = _regex_node_get_next(current);

            if(next){
                _regex_node_add(*next, sim->next);
            }
        } else if(current->type == REGEX_END){
            sim->result->matches = 1;
        } else if(current->type == REGEX_GROUP){
            int group_id = current->data_u.group_s.id;
            int is_open = current->data_u.group_s.open;
            /* Open marker: index of the next character.
             * Close marker: index of the last consumed character. */
            int position = sim->index - 1 + is_open;

            /* Ids outside the tracked range are ignored entirely, so that
             * neither group array is indexed out of bounds.
             * A close marker reached before any input was consumed gives
             * position -1 (empty group at the start); it is recorded without
             * reading string[-1]. */
            if(group_id >= 1 && group_id < LIBREGEX_MAX_GROUP_COUNT
                    && (position < 0 || sim->string[position] != '\0')){
                int* to_set = is_open
                    ? &(sim->groups[group_id - 1].from)
                    : &(sim->groups[group_id - 1].to);

                *to_set = position;
                if(is_open == 0){
                    regex_match* new_match = sim->result->groups[group_id - 1];

                    if(new_match == NULL){
                        new_match = malloc(sizeof(*new_match));
                    }
                    if(new_match){
                        new_match->from = sim->groups[group_id - 1].from;
                        new_match->to = sim->groups[group_id - 1].to;
                        sim->result->groups[group_id - 1] = new_match;
                    } else {
                        result = LIBREGEX_MALLOC;
                    }
                }
            }
        }
    }

    swap_temp = sim->current;
    sim->current = sim->next;
    sim->next = swap_temp;
    sim->next->size = 0;
    /* Ids advance by two so the list being refilled always carries an id
     * larger than any tag left on the nodes. */
    sim->next->id += 2;
    return result;
}

/*
 * Runs the compiled expression `root` against `string`, starting at its first
 * character, and fills `build_result`: `matches` is set to 1 once an end node
 * is reached, and `groups` receives heap-allocated ranges for captured groups
 * (release them with regex_result_free()).
 *
 * Returns LIBREGEX_INVALID for a NULL `string` or `build_result`, and
 * LIBREGEX_MALLOC when out of memory.  The visit tags on the graph are reset
 * before returning, including after a failed run, so the graph remains
 * usable for later matches.
 */
libregex_result regex_match_string(regex_node* root, char* string, regex_result* build_result){
    ll clear_ll;
    libregex_result result;
    libregex_result reset_result;
    regex_sim sim;
    regex_list list_a;
    regex_list list_b;

    if(build_result == NULL){
        return LIBREGEX_INVALID;
    }
    build_result->matches = 0;
    memset(&build_result->groups, 0, sizeof(build_result->groups));
    if(string == NULL){
        return LIBREGEX_INVALID;
    }

    list_a.size = 0;
    list_a.id = 0;
    list_b.size = 0;
    list_b.id = 1;

    memset(&sim.groups, 0, sizeof(sim.groups));
    sim.current = &list_a;
    sim.next = &list_b;
    sim.result = build_result;
    sim.string = string;
    sim.index = 0;

    _regex_node_add(root, &list_a);
    /* The terminating NUL is stepped over as well, so that end and group
     * nodes reached by the final character are still processed. */
    do {
        result = _regex_step(&sim);
    } while(sim.string[sim.index++] != '\0' && result == LIBREGEX_SUCCESS);

    /* Put every node's list_id back to -1 for the next match. */
    ll_init(&clear_ll);
    reset_result = _regex_find_all(root, &clear_ll, -1);
    ll_clear(&clear_ll);
    if(result == LIBREGEX_SUCCESS){
        result = reset_result;
    }
    return result;
}

/*
 * Releases the group ranges stored in `result` and resets it.
 * The slots are set to NULL, so calling this again is harmless.
 * A NULL `result` is ignored.
 */
void regex_result_free(regex_result* result){
    int index = 0;

    if(result == NULL){
        return;
    }
    for(; index < LIBREGEX_MAX_GROUP_COUNT; index++){
        free(result->groups[index]);
        result->groups[index] = NULL;
    }
    result->matches = 0;
}