libregex/src/libregex.c

602 lines
18 KiB
C

#include "libregex.h"
#include <stdlib.h>
#include <string.h>
#include <libregex.h>
#include "ll.h"
/**
 * List-iteration callback that releases a single element.
 *
 * Passed to ll_foreach() by regex_free() so every collected node is freed.
 *
 * @param data  Heap pointer stored in the list; handed straight to free().
 * @param args  Extra arguments supplied by the iterator; not used here.
 * @return Always 0.
 */
int _regex_node_foreach_free(void* data, va_list args){
    (void)args;
    free(data);
    return 0;
}
/**
 * Allocates one regex node and resets it with regex_node_clear().
 *
 * @param node  Out-parameter; receives the new node, or NULL when malloc fails.
 * @return LIBREGEX_SUCCESS on success, LIBREGEX_MALLOC when allocation failed.
 */
libregex_result _regex_node_create_clear(regex_node** node){
    regex_node* fresh = malloc(sizeof(*fresh));
    *node = fresh;
    if(fresh == NULL){
        return LIBREGEX_MALLOC;
    }
    regex_node_clear(fresh);
    return LIBREGEX_SUCCESS;
}
/**
 * Creates a REGEX_VALUE node, which matches exactly one character.
 *
 * @param node   Out-parameter; receives the new node (NULL on failure).
 * @param value  Character the node matches.
 * @param next   Successor node; may be NULL and linked later.
 * @return Result of the underlying allocation (LIBREGEX_SUCCESS or
 *         LIBREGEX_MALLOC).
 */
libregex_result _regex_node_create_value(regex_node** node, char value, regex_node* next){
    libregex_result status = _regex_node_create_clear(node);
    regex_node* created = *node;
    if(status != LIBREGEX_SUCCESS){
        return status;
    }
    created->type = REGEX_VALUE;
    created->data_u.value_s.value = value;
    created->data_u.value_s.next = next;
    return status;
}
/**
 * Creates a REGEX_RANGE node, which matches any character c with
 * from <= c <= to.
 *
 * @param node  Out-parameter; receives the new node (NULL on failure).
 * @param from  Lower bound of the range (inclusive).
 * @param to    Upper bound of the range (inclusive).
 * @param next  Successor node; may be NULL and linked later.
 * @return Result of the underlying allocation (LIBREGEX_SUCCESS or
 *         LIBREGEX_MALLOC).
 */
libregex_result _regex_node_create_range(regex_node** node, char from, char to, regex_node* next){
    libregex_result status = _regex_node_create_clear(node);
    regex_node* created = *node;
    if(status != LIBREGEX_SUCCESS){
        return status;
    }
    created->type = REGEX_RANGE;
    created->data_u.range_s.from = from;
    created->data_u.range_s.to = to;
    created->data_u.range_s.next = next;
    return status;
}
/**
 * Creates a REGEX_ANY node (the '.' wildcard).
 *
 * @param node  Out-parameter; receives the new node (NULL on failure).
 * @param next  Successor node; may be NULL and linked later.
 * @return Result of the underlying allocation (LIBREGEX_SUCCESS or
 *         LIBREGEX_MALLOC).
 */
libregex_result _regex_node_create_any(regex_node** node, regex_node* next){
    libregex_result status = _regex_node_create_clear(node);
    regex_node* created = *node;
    if(status != LIBREGEX_SUCCESS){
        return status;
    }
    created->type = REGEX_ANY;
    created->data_u.any_s.next = next;
    return status;
}
/**
 * Creates a REGEX_CONNECT node: a node that consumes no input and only
 * forwards to its successor.
 *
 * @param node  Out-parameter; receives the new node (NULL on failure).
 * @param next  Successor node; may be NULL and linked later.
 * @return Result of the underlying allocation (LIBREGEX_SUCCESS or
 *         LIBREGEX_MALLOC).
 */
libregex_result _regex_node_create_connect(regex_node** node, regex_node* next){
    libregex_result status = _regex_node_create_clear(node);
    regex_node* created = *node;
    if(status != LIBREGEX_SUCCESS){
        return status;
    }
    created->type = REGEX_CONNECT;
    created->data_u.connect_s.next = next;
    return status;
}
/**
 * Creates a REGEX_FORK node, which branches to two successors.
 *
 * @param node   Out-parameter; receives the new node (NULL on failure).
 * @param left   First branch.
 * @param right  Second branch.
 * @return Result of the underlying allocation (LIBREGEX_SUCCESS or
 *         LIBREGEX_MALLOC).
 */
libregex_result _regex_node_create_fork(regex_node** node, regex_node* left, regex_node* right){
    libregex_result status = _regex_node_create_clear(node);
    regex_node* created = *node;
    if(status != LIBREGEX_SUCCESS){
        return status;
    }
    created->type = REGEX_FORK;
    created->data_u.fork_s.left = left;
    created->data_u.fork_s.right = right;
    return status;
}
/**
 * Creates the pair of REGEX_GROUP nodes that bracket a capture group.
 *
 * Both nodes carry the same group id and point at each other through
 * group_s.other. The opening node gets group_s.open = 1; the closing node
 * keeps the 0 left behind by regex_node_clear().
 *
 * @param open      Out-parameter; receives the opening node.
 * @param close     Out-parameter; receives the closing node.
 * @param group_id  Identifier stored in both nodes.
 * @return LIBREGEX_SUCCESS, or LIBREGEX_MALLOC if either allocation failed.
 *         On failure nothing is leaked and both *open and *close are NULL.
 */
libregex_result _regex_node_create_group(regex_node** open, regex_node** close, int group_id){
    libregex_result result;
    /* If the first allocation fails the second one is never attempted, so
     * *close would otherwise still hold whatever the caller passed in when
     * the error path calls free(*close). */
    *close = NULL;
    result = _regex_node_create_clear(open);
    if(result == LIBREGEX_SUCCESS){
        result = _regex_node_create_clear(close);
    }
    if(result == LIBREGEX_SUCCESS){
        (*open)->type = (*close)->type = REGEX_GROUP;
        (*open)->data_u.group_s.id = (*close)->data_u.group_s.id = group_id;
        (*open)->data_u.group_s.other = (*close);
        (*close)->data_u.group_s.other = (*open);
        (*open)->data_u.group_s.open = 1;
    } else {
        /* free(NULL) is a no-op, so whichever allocation failed is harmless here. */
        free(*open);
        free(*close);
        *open = *close = NULL;
    }
    return result;
}
/**
 * Allocates a chain descriptor and sets its head and tail.
 *
 * @param new_chain  Out-parameter; receives the chain, or NULL when malloc fails.
 * @param head       First node of the chain (may be NULL for an empty chain).
 * @param tail       Last node of the chain (may be NULL for an empty chain).
 * @return LIBREGEX_SUCCESS on success, LIBREGEX_MALLOC when allocation failed.
 */
libregex_result _regex_chain_create(regex_chain** new_chain, regex_node* head, regex_node* tail){
    regex_chain* created = malloc(sizeof(*created));
    *new_chain = created;
    if(created == NULL){
        return LIBREGEX_MALLOC;
    }
    created->head = head;
    created->tail = tail;
    return LIBREGEX_SUCCESS;
}
/**
 * Returns the address of a node's single "next" link.
 *
 * @param node  Node to inspect; must not be NULL.
 * @return Pointer to the next-field for CONNECT, VALUE, RANGE, ANY and GROUP
 *         nodes; NULL for every other node type (e.g. forks, which have two
 *         successors rather than one).
 */
regex_node** _regex_node_get_next(regex_node* node){
    switch(node->type){
        case REGEX_CONNECT:
            return &node->data_u.connect_s.next;
        case REGEX_VALUE:
            return &node->data_u.value_s.next;
        case REGEX_RANGE:
            return &node->data_u.range_s.next;
        case REGEX_ANY:
            return &node->data_u.any_s.next;
        case REGEX_GROUP:
            return &node->data_u.group_s.next;
        default:
            return NULL;
    }
}
/**
 * Links to_append as the successor of append_to.
 *
 * Does nothing when append_to has no single next link (see
 * _regex_node_get_next()).
 *
 * @param append_to  Node whose next link is overwritten; must not be NULL.
 * @param to_append  New successor (may be NULL to clear the link).
 */
void _regex_node_append_node(regex_node* append_to, regex_node* to_append){
    regex_node** link = _regex_node_get_next(append_to);
    if(link == NULL){
        return;
    }
    *link = to_append;
}
/**
 * Appends a node to the end of a chain and makes it the new tail.
 *
 * An empty chain (head == NULL) gets the node as both head and tail.
 * Otherwise the node is hooked onto the current tail's next link. If the
 * tail has no such link the chain is left unchanged.
 *
 * @param append_to  Chain to extend; ignored when NULL.
 * @param to_append  Node to add; ignored when NULL.
 */
void _regex_chain_append_node(regex_chain* append_to, regex_node* to_append){
    regex_node** slot;
    if(append_to == NULL || to_append == NULL){
        return;
    }
    if(append_to->head){
        slot = _regex_node_get_next(append_to->tail);
    } else {
        slot = &append_to->head;
    }
    if(slot == NULL){
        return;
    }
    append_to->tail = to_append;
    *slot = to_append;
}
/**
 * Inserts a node in front of a chain and makes it the new head.
 *
 * The node's next link is pointed at the old head. If the chain had no tail
 * yet, the node also becomes the tail. Nodes without a single next link are
 * not inserted.
 *
 * @param prepend_to  Chain to extend; ignored when NULL.
 * @param to_prepend  Node to insert; ignored when NULL.
 */
void _regex_chain_prepend_node(regex_chain* prepend_to, regex_node* to_prepend){
    regex_node** link;
    if(prepend_to == NULL || to_prepend == NULL){
        return;
    }
    link = _regex_node_get_next(to_prepend);
    if(link == NULL){
        return;
    }
    *link = prepend_to->head;
    prepend_to->head = to_prepend;
    if(prepend_to->tail == NULL){
        prepend_to->tail = to_prepend;
    }
}
/**
 * Concatenates to_append onto the end of append_to.
 *
 * The head of to_append is linked after the current tail, then the tail of
 * append_to is moved to the tail of to_append (when that tail is set). The
 * to_append descriptor itself is neither modified nor freed.
 *
 * @param append_to  Chain that grows; ignored when NULL.
 * @param to_append  Chain whose nodes are attached; ignored when NULL.
 */
void _regex_chain_append_chain(regex_chain* append_to, regex_chain* to_append){
    if(append_to == NULL || to_append == NULL){
        return;
    }
    _regex_chain_append_node(append_to, to_append->head);
    if(to_append->tail != NULL){
        append_to->tail = to_append->tail;
    }
}
/**
 * Collects every node reachable from node into a list, tagging each one.
 *
 * A node whose list_id already equals tag_with is treated as visited, which
 * is what stops the traversal on cycles. Each newly visited node is tagged
 * first, then its successors are visited, and finally the node itself is
 * appended to the list.
 *
 * @param node       Start of the traversal; NULL is accepted and ignored.
 * @param append_to  List that receives the visited nodes.
 * @param tag_with   Value written into list_id of every visited node.
 * @return LIBREGEX_SUCCESS, or LIBREGEX_MALLOC when ll_append() fails. On
 *         failure the traversal stops early, so the list may be incomplete.
 */
libregex_result _regex_find_all(regex_node* node, ll* append_to, int tag_with){
    libregex_result status = LIBREGEX_SUCCESS;
    regex_node* successor = NULL;
    if(node == NULL || node->list_id == tag_with){
        return status;
    }
    node->list_id = tag_with;
    if(node->type == REGEX_FORK){
        status = _regex_find_all(node->data_u.fork_s.left, append_to, tag_with);
        if(status == LIBREGEX_SUCCESS){
            status = _regex_find_all(node->data_u.fork_s.right, append_to, tag_with);
        }
    } else {
        /* Single-successor nodes; any other type keeps successor == NULL,
         * which the recursive call treats as "nothing to visit". */
        if(node->type == REGEX_VALUE){
            successor = node->data_u.value_s.next;
        } else if(node->type == REGEX_RANGE){
            successor = node->data_u.range_s.next;
        } else if(node->type == REGEX_ANY){
            successor = node->data_u.any_s.next;
        } else if(node->type == REGEX_GROUP){
            successor = node->data_u.group_s.next;
        } else if(node->type == REGEX_CONNECT){
            successor = node->data_u.connect_s.next;
        }
        status = _regex_find_all(successor, append_to, tag_with);
    }
    if(status == LIBREGEX_SUCCESS && ll_append(append_to, node) != LIBDS_SUCCESS){
        status = LIBREGEX_MALLOC;
    }
    return status;
}
/**
 * Reads one literal character from the expression, honouring '\' escapes.
 *
 * A backslash at the current position is skipped so that the character
 * after it is taken literally. The index is advanced past the character
 * that was read.
 *
 * @param read_into     Receives the character, or '\0' on failure.
 * @param expression    Pattern text being parsed.
 * @param string_index  In/out cursor into expression.
 * @return LIBREGEX_SUCCESS, or LIBREGEX_INVALID when the end of the
 *         expression is reached (including right after a backslash).
 */
libregex_result _regex_read_value(char* read_into, char* expression, int* string_index){
    char current;
    if(expression[*string_index] == '\\'){
        *string_index += 1;
    }
    current = expression[*string_index];
    if(current == '\0'){
        *read_into = '\0';
        return LIBREGEX_INVALID;
    }
    *read_into = current;
    *string_index += 1;
    return LIBREGEX_SUCCESS;
}
/**
 * Parses a bracketed character class ("[...]") into a chain.
 *
 * On entry *string_index must point at the '['. Each entry in the class is
 * either a single character or a "from-to" range; characters are read with
 * _regex_read_value(), so backslash escapes apply. Every entry becomes a
 * VALUE or RANGE node whose next link is a shared CONNECT node (the chain
 * tail). Entries after the first are combined with FORK nodes, so the chain
 * head is the outermost fork.
 *
 * @param build_into    Out-parameter; receives the new chain, or NULL on error.
 * @param expression    Pattern text being parsed.
 * @param string_index  In/out cursor; on success it is left just past ']'.
 * @return LIBREGEX_SUCCESS, LIBREGEX_MALLOC on allocation failure, or
 *         LIBREGEX_INVALID when the class is malformed or unterminated.
 */
libregex_result _regex_build_or(regex_chain** build_into, char* expression, int* string_index){
    regex_node* tail_node = NULL;
    libregex_result result = _regex_chain_create(build_into, NULL, NULL);
    if(result == LIBREGEX_SUCCESS){
        /* Every alternative needs this shared exit node, so failing to
         * allocate it is an error rather than something to ignore. */
        result = _regex_node_create_connect(&tail_node, NULL);
    }
    if(result == LIBREGEX_SUCCESS){
        _regex_chain_append_node(*build_into, tail_node);
        /* Step over the opening '['. */
        (*string_index)++;
    }
    while(result == LIBREGEX_SUCCESS && expression[*string_index] && expression[*string_index] != ']'){
        regex_node* new_node = NULL;
        regex_node* fork = NULL;
        regex_node* new_head = NULL;
        char from = '\0';
        /* '\0' doubles as the "no range" marker: a successful read never yields it. */
        char to = '\0';
        result = _regex_read_value(&from, expression, string_index);
        if(result == LIBREGEX_SUCCESS){
            if(expression[*string_index] == '-'){
                (*string_index)++;
                result = _regex_read_value(&to, expression, string_index);
            }
        }
        if(result == LIBREGEX_SUCCESS){
            result = (to == '\0') ? _regex_node_create_value(&new_node, from, tail_node) :
                                    _regex_node_create_range(&new_node, from, to, tail_node);
            new_head = new_node;
        }
        /* While the head is still the bare tail node this is the first entry
         * and needs no fork. */
        if(result == LIBREGEX_SUCCESS && (*build_into)->head != tail_node){
            result = _regex_node_create_fork(&fork, (*build_into)->head, new_node);
            new_head = fork;
        }
        if(result == LIBREGEX_SUCCESS){
            (*build_into)->head = new_head;
        } else {
            /* Nothing created in this iteration has been linked into the
             * chain yet. new_node must be released explicitly: when the fork
             * allocation fails, new_head is NULL and no longer refers to it. */
            free(fork);
            free(new_node);
        }
    }
    if(result == LIBREGEX_SUCCESS){
        if(expression[*string_index] == ']'){
            (*string_index)++;
        } else {
            result = LIBREGEX_INVALID;
        }
    }
    if(result != LIBREGEX_SUCCESS){
        /* *build_into is NULL when the chain descriptor itself could not be
         * allocated, so it must not be dereferenced unconditionally. */
        if(*build_into){
            regex_free((*build_into)->head);
            free(*build_into);
        }
        *build_into = NULL;
    }
    return result;
}
/**
 * Tells whether a character is one of the postfix repetition operators.
 *
 * @param c  Character to classify.
 * @return 1 for '*', '+' or '?'; 0 for anything else.
 */
int _regex_is_char_op(char c){
    switch(c){
        case '*':
        case '+':
        case '?':
            return 1;
        default:
            return 0;
    }
}
/**
 * Moves the nodes of *to_merge onto the end of *merge_into and disposes of
 * the now-redundant chain descriptor.
 *
 * Only the descriptor is freed; the nodes live on inside *merge_into.
 * A NULL *to_merge is accepted and results in no change.
 *
 * @param merge_into  Address of the chain that absorbs the nodes.
 * @param to_merge    Address of the chain to absorb; set to NULL on return.
 */
void _regex_chain_merge(regex_chain** merge_into, regex_chain** to_merge){
    regex_chain* absorbed = *to_merge;
    _regex_chain_append_chain(*merge_into, absorbed);
    *to_merge = NULL;
    free(absorbed);
}
/**
 * Parses a sequence of regex terms into a node chain, recursing for groups.
 *
 * Two calling modes are distinguished by what *string_index points at:
 *  - at a '(' : a parenthesised group is parsed; the resulting chain is
 *    wrapped in a pair of GROUP nodes and the matching ')' is consumed;
 *  - anywhere else (regex_build() passes -1, i.e. one before the first
 *    character): the top-level expression is parsed up to the terminator.
 *
 * The most recently parsed term is held in sub_chain until the next
 * character is seen, so that a following '*', '+' or '?' can still be applied
 * to it. Alternatives separated by '|' are parked on chain_stack and joined
 * with FORK nodes once the whole sequence has been read.
 *
 * @param build_into    Out-parameter; receives the chain, or NULL on error.
 * @param expression    Pattern text being parsed.
 * @param string_index  In/out cursor into expression.
 * @param groups        In/out running count of groups; incremented for each
 *                      '(' and used as the new group's id.
 * @return LIBREGEX_SUCCESS, LIBREGEX_MALLOC on allocation failure, or
 *         LIBREGEX_INVALID for malformed input (an operator with nothing to
 *         apply to, an empty final alternative, or unbalanced parentheses).
 *         On error everything built so far is released.
 */
libregex_result _regex_build_chain(regex_chain** build_into, char* expression, int* string_index, int* groups){
    regex_node* group_open = NULL;
    regex_node* group_close = NULL;
    regex_chain* current_chain = NULL;
    regex_chain* sub_chain = NULL;
    ll chain_stack;
    int is_subchain = *string_index >= 0 && expression[*string_index] == '(';
    libregex_result result = LIBREGEX_SUCCESS;
    if(is_subchain){
        result = _regex_node_create_group(&group_open, &group_close, ++(*groups));
    }
    if(result == LIBREGEX_SUCCESS){
        result = _regex_chain_create(&current_chain, NULL, NULL);
    }
    if(result == LIBREGEX_SUCCESS){
        /* Step past the '(' or, for the top-level call, onto the first character. */
        (*string_index)++;
    }
    ll_init(&chain_stack);
    /* result is tested first: if setup failed the cursor was never advanced
     * and may still be negative, so expression must not be indexed. */
    while(result == LIBREGEX_SUCCESS && expression[*string_index] && expression[*string_index] != ')'){
        if(_regex_is_char_op(expression[*string_index])){
            if(sub_chain){
                regex_node* connect = NULL;
                regex_node* fork = NULL;
                result = _regex_node_create_connect(&connect, NULL);
                if(result == LIBREGEX_SUCCESS){
                    /* The fork chooses between entering the term and skipping to connect. */
                    result = _regex_node_create_fork(&fork, sub_chain->head, connect);
                }
                if(result == LIBREGEX_SUCCESS){
                    if(expression[*string_index] == '+' || expression[*string_index] == '*'){
                        /* Loop: after the term, go back through the fork. */
                        _regex_chain_append_node(sub_chain, fork);
                        if(expression[*string_index] == '*'){
                            /* Zero repetitions allowed: enter through the fork as well. */
                            sub_chain->head = fork;
                        }
                    } else {
                        /* '?': the term runs at most once and then joins connect. */
                        _regex_chain_append_node(sub_chain, connect);
                        sub_chain->head = fork;
                    }
                    sub_chain->tail = connect;
                    _regex_chain_merge(&current_chain, &sub_chain);
                    (*string_index)++;
                } else {
                    free(connect);
                    free(fork);
                }
            } else {
                /* Operator with no preceding term. */
                result = LIBREGEX_INVALID;
            }
        } else if(expression[*string_index] == '('){
            _regex_chain_merge(&current_chain, &sub_chain);
            result = _regex_build_chain(&sub_chain, expression, string_index, groups);
        } else if(expression[*string_index] == '['){
            _regex_chain_merge(&current_chain, &sub_chain);
            result = _regex_build_or(&sub_chain, expression, string_index);
        } else if(expression[*string_index] == '|'){
            /* Park the finished alternative and start a fresh one. */
            _regex_chain_merge(&current_chain, &sub_chain);
            result = ll_append(&chain_stack, current_chain) == LIBDS_SUCCESS ? LIBREGEX_SUCCESS : LIBREGEX_MALLOC;
            if(result == LIBREGEX_SUCCESS){
                result = _regex_chain_create(&current_chain, NULL, NULL);
            }
            (*string_index)++;
        } else {
            regex_node* new_node = NULL;
            _regex_chain_merge(&current_chain, &sub_chain);
            if(expression[*string_index] == '.'){
                result = _regex_node_create_any(&new_node, NULL);
                if(result == LIBREGEX_SUCCESS){
                    (*string_index)++;
                }
            } else {
                char into = '\0';
                result = _regex_read_value(&into, expression, string_index);
                if(result == LIBREGEX_SUCCESS){
                    result = _regex_node_create_value(&new_node, into, NULL);
                }
            }
            if(result == LIBREGEX_SUCCESS){
                result = _regex_chain_create(&sub_chain, new_node, new_node);
            }
            if(result != LIBREGEX_SUCCESS){
                free(new_node);
                free(sub_chain);
                sub_chain = NULL;
            }
        }
    }
    if(result == LIBREGEX_SUCCESS){
        /* Flush the last pending term. */
        _regex_chain_merge(&current_chain, &sub_chain);
    } else {
        if(sub_chain){
            regex_free(sub_chain->head);
            free(sub_chain);
            sub_chain = NULL;
        }
    }
    if(result == LIBREGEX_SUCCESS && is_subchain && expression[*string_index] != ')'){
        /* Group opened but never closed. */
        result = LIBREGEX_INVALID;
    }
    if(result == LIBREGEX_SUCCESS && !is_subchain && expression[*string_index] == ')'){
        /* A ')' with no matching '(' would otherwise end parsing early and
         * silently drop the rest of the pattern. */
        result = LIBREGEX_INVALID;
    }
    if(result == LIBREGEX_SUCCESS && chain_stack.head){
        if(current_chain->tail){
            /* All alternatives converge on one shared CONNECT node. */
            regex_node* end_node = NULL;
            result = _regex_node_create_connect(&end_node, NULL);
            _regex_chain_append_node(current_chain, end_node);
            while(chain_stack.tail && result == LIBREGEX_SUCCESS){
                regex_node* fork = NULL;
                regex_chain* new_chain = ll_poptail(&chain_stack);
                /* NOTE(review): new_chain->head is read before end_node is
                 * appended, so an empty non-final alternative (e.g. "|a")
                 * appears to leave the fork's right branch NULL. Confirm
                 * whether such patterns should be rejected instead. */
                result = _regex_node_create_fork(&fork, current_chain->head, new_chain->head);
                if(result == LIBREGEX_SUCCESS){
                    _regex_chain_append_node(new_chain, end_node);
                    current_chain->head = fork;
                } else {
                    if(new_chain){
                        regex_free(new_chain->head);
                    }
                }
                free(new_chain);
            }
        } else {
            /* The alternative after the last '|' is empty. */
            result = LIBREGEX_INVALID;
        }
    }
    if(result == LIBREGEX_SUCCESS){
        if(is_subchain){
            /* Consume the closing ')'. */
            (*string_index)++;
        }
        /* Both group nodes are NULL for the top-level call, which makes
         * these two calls no-ops there. */
        _regex_chain_append_node(current_chain, group_close);
        _regex_chain_prepend_node(current_chain, group_open);
        *build_into = current_chain;
    } else {
        while(chain_stack.tail){
            regex_chain* new_chain = ll_poptail(&chain_stack);
            if(new_chain){
                regex_free(new_chain->head);
                free(new_chain);
            }
        }
        if(current_chain){
            regex_free(current_chain->head);
            free(current_chain);
        }
        free(group_close);
        free(group_open);
        *build_into = NULL;
    }
    return result;
}
/**
 * Resets a node to its blank state: zeroed payload, list_id of -1 and type
 * REGEX_CLEAR.
 *
 * @param node  Node to reset; must not be NULL.
 */
void regex_node_clear(regex_node* node){
    memset(&node->data_u, 0, sizeof(node->data_u));
    node->list_id = -1;
    node->type = REGEX_CLEAR;
}
/**
 * Frees every node reachable from root.
 *
 * The nodes are first gathered with _regex_find_all() (tagged with -2) and
 * then released one by one. Whatever was gathered is freed even if the
 * gathering step reported an error.
 *
 * @param root  Entry node of the graph; NULL is accepted.
 * @return The result of the gathering step: LIBREGEX_SUCCESS, or
 *         LIBREGEX_MALLOC if the node list could not be fully built.
 */
libregex_result regex_free(regex_node* root){
    ll reachable;
    libregex_result status;
    ll_init(&reachable);
    status = _regex_find_all(root, &reachable, -2);
    ll_foreach(&reachable, NULL, compare_always, _regex_node_foreach_free);
    ll_free(&reachable);
    return status;
}
/**
 * Compiles a pattern string into a node graph.
 *
 * The parsed chain is terminated with a REGEX_END node. Only the temporary
 * chain descriptor is freed here; the nodes themselves are handed to the
 * caller through *root.
 *
 * @param root        Out-parameter; receives the entry node, or NULL on error.
 * @param expression  NUL-terminated pattern text.
 * @return LIBREGEX_SUCCESS, or the error reported while allocating or
 *         parsing (LIBREGEX_MALLOC / LIBREGEX_INVALID).
 */
libregex_result regex_build(regex_node** root, char* expression){
    regex_chain* building_chain = NULL;
    regex_node* end_node = NULL;
    /* The parser advances the cursor before reading, so start one before index 0. */
    int index = -1;
    int number_groups = 0;
    libregex_result result = _regex_node_create_clear(&end_node);
    if(result == LIBREGEX_SUCCESS){
        end_node->type = REGEX_END;
        result = _regex_build_chain(&building_chain, expression, &index, &number_groups);
    }
    if(result != LIBREGEX_SUCCESS){
        free(end_node);
        if(building_chain){
            regex_free(building_chain->head);
            free(building_chain);
        }
        *root = NULL;
        return result;
    }
    _regex_chain_append_node(building_chain, end_node);
    *root = building_chain->head;
    free(building_chain);
    return result;
}
/**
 * Tests whether a node consumes the given character.
 *
 * @param node  Node to test; must not be NULL.
 * @param c     Current input character.
 * @return Non-zero when a VALUE node holds exactly c, a RANGE node contains
 *         c (bounds inclusive), or an ANY node sees anything but '\0';
 *         0 for every other node type.
 */
int _regex_node_matches(regex_node* node, char c){
    switch(node->type){
        case REGEX_VALUE:
            return node->data_u.value_s.value == c;
        case REGEX_RANGE:
            return node->data_u.range_s.from <= c && c <= node->data_u.range_s.to;
        case REGEX_ANY:
            return c != '\0';
        default:
            return 0;
    }
}
/**
 * Adds a node, and everything reachable from it without consuming input,
 * to a simulation list.
 *
 * CONNECT and FORK nodes are never stored; their successors are added in
 * their place. GROUP nodes are stored, but only after their successor has
 * been added. Each visited node is tagged with the list's id so that it is
 * added at most once per list. Nodes are dropped once the list holds
 * LIBREGEX_MAX_NODE_COUNT entries.
 *
 * @param node  Node to add; NULL is ignored.
 * @param list  Destination list.
 */
void _regex_node_add(regex_node* node, regex_list* list){
    if(node && node->list_id < list->id && list->size < LIBREGEX_MAX_NODE_COUNT){
        node->list_id = list->id;
        if(node->type == REGEX_CONNECT){
            _regex_node_add(node->data_u.connect_s.next, list);
        } else if(node->type == REGEX_FORK){
            _regex_node_add(node->data_u.fork_s.left, list);
            _regex_node_add(node->data_u.fork_s.right, list);
        } else if(!(node->type == REGEX_CLEAR)){
            if(node->type == REGEX_GROUP){
                _regex_node_add(node->data_u.group_s.next, list);
            }
            /* The capacity test at the top ran before the recursion above,
             * which may have filled the list in the meantime; check again
             * so the store cannot land past LIBREGEX_MAX_NODE_COUNT. */
            if(list->size < LIBREGEX_MAX_NODE_COUNT){
                list->nodes[list->size++] = node;
            }
        }
    }
}
/**
 * Advances the simulation by one input character.
 *
 * Every node in the current list is examined against
 * sim->string[sim->index]:
 *  - a node that matches the character has its successor added to the next
 *    list;
 *  - an END node marks the result as matching;
 *  - a GROUP node records the group's start or end position and, for a
 *    closing node, publishes the (from, to) pair into sim->result->groups.
 * Afterwards the two lists are swapped and the new "next" list is emptied.
 * The caller is responsible for advancing sim->index.
 *
 * @param sim  Simulation state.
 * @return LIBREGEX_SUCCESS, or LIBREGEX_MALLOC when a group match record
 *         could not be allocated (the remaining nodes are still processed).
 */
libregex_result _regex_step(regex_sim* sim){
    libregex_result result = LIBREGEX_SUCCESS;
    regex_list* swap_temp;
    int index = 0;
    for(; index < sim->current->size; index++){
        regex_node* current = sim->current->nodes[index];
        if(_regex_node_matches(current, sim->string[sim->index])){
            regex_node** next = _regex_node_get_next(current);
            if(next){
                _regex_node_add(*next, sim->next);
            }
        } else if(current->type == REGEX_END){
            sim->result->matches = 1;
        } else if(current->type == REGEX_GROUP){
            int is_open = current->data_u.group_s.open;
            int slot = current->data_u.group_s.id - 1;
            /* An opening node records the current character's index, a
             * closing node the index of the last consumed character. */
            int position = sim->index - 1 + is_open;
            /* Group data is only ever read back under this bound, so
             * applying it to the writes as well changes no result while
             * keeping out-of-range ids from indexing past the arrays.
             * NOTE(review): with "<" the slot at index
             * LIBREGEX_MAX_GROUP_COUNT - 1 is never used; confirm whether
             * "<=" was intended. */
            int in_range = slot >= 0 && current->data_u.group_s.id < LIBREGEX_MAX_GROUP_COUNT;
            /* A closing node reached before any input was consumed yields
             * position -1; the string must not be indexed there. */
            if(in_range && (position < 0 || sim->string[position] != '\0')){
                int* to_set = is_open ? &(sim->groups[slot].from) : &(sim->groups[slot].to);
                *to_set = position;
                if(is_open == 0){
                    /* Reuse the record from an earlier closing of the same group, if any. */
                    regex_match* new_match = sim->result->groups[slot];
                    if(new_match == NULL){
                        new_match = malloc(sizeof(*new_match));
                    }
                    if(new_match){
                        new_match->from = sim->groups[slot].from;
                        new_match->to = sim->groups[slot].to;
                        sim->result->groups[slot] = new_match;
                    } else {
                        result = LIBREGEX_MALLOC;
                    }
                }
            }
        }
    }
    swap_temp = sim->current;
    sim->current = sim->next;
    sim->next = swap_temp;
    sim->next->size = 0;
    /* Stepping by two keeps the two lists' ids distinct and increasing, as
     * long as they start one apart (regex_match_string seeds them with 0
     * and 1). */
    sim->next->id += 2;
    return result;
}
/**
 * Runs a compiled regex against a string.
 *
 * The simulation starts at the first character only (the root is added to
 * the state list once, before any input is read), and the result is marked
 * as matching as soon as the END node is reached, whether or not the whole
 * string has been consumed.
 *
 * @param root          Entry node returned by regex_build().
 * @param string        NUL-terminated input text.
 * @param build_result  Receives the outcome. matches and groups are reset on
 *                      entry without freeing previous contents. Group
 *                      records allocated during the run should be released
 *                      with regex_result_free(), also after a failure.
 * @return LIBREGEX_SUCCESS, or LIBREGEX_MALLOC if an allocation failed
 *         during matching or while resetting the node tags.
 */
libregex_result regex_match_string(regex_node* root, char* string, regex_result* build_result){
    ll clear_ll;
    libregex_result result;
    libregex_result reset_result;
    regex_sim sim;
    regex_list list_a;
    regex_list list_b;
    list_a.size = 0;
    list_a.id = 0;
    list_b.size = 0;
    list_b.id = 1;
    build_result->matches = 0;
    memset(&build_result->groups, 0, sizeof(build_result->groups));
    memset(&sim.groups, 0, sizeof(sim.groups));
    sim.current = &list_a;
    sim.next = &list_b;
    sim.result = build_result;
    sim.string = string;
    sim.index = 0;
    _regex_node_add(root, &list_a);
    /* The terminating '\0' is fed through one step as well, so nodes that
     * are pending at the end of the input (END, closing groups) are seen. */
    do {
        result = _regex_step(&sim);
    } while(sim.string[sim.index++] != '\0' && result == LIBREGEX_SUCCESS);
    /* Restore list_id to -1 on every visited node, even when a step failed.
     * Otherwise the stale tags would keep _regex_node_add() from ever
     * accepting these nodes again and the regex could no longer match. */
    ll_init(&clear_ll);
    reset_result = _regex_find_all(root, &clear_ll, -1);
    ll_clear(&clear_ll);
    if(result == LIBREGEX_SUCCESS){
        /* Report a reset failure only when matching itself succeeded. */
        result = reset_result;
    }
    return result;
}
/**
 * Releases the group records held by a match result and resets it.
 *
 * Each slot is set to NULL after being freed, so calling this function
 * again on the same result is harmless instead of a double free.
 *
 * @param result  Result to clean up; the structure itself is not freed.
 */
void regex_result_free(regex_result* result){
    int index = 0;
    for(; index < LIBREGEX_MAX_GROUP_COUNT; index++){
        free(result->groups[index]);
        result->groups[index] = NULL;
    }
    result->matches = 0;
}