/* Source listing: 602 lines, 18 KiB, C */
#include "libregex.h"
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <libregex.h>
|
|
#include "ll.h"
|
|
|
|
/*
 * List callback handed to ll_foreach by regex_free: releases the payload
 * stored in one list element. The variadic arguments are not used.
 * Always returns 0.
 */
int _regex_node_foreach_free(void* data, va_list args){
    (void)args;
    free(data);
    return 0;
}
|
|
|
|
/*
 * Allocates a node and resets it with regex_node_clear.
 *
 * node: receives the new node, or NULL when the allocation fails.
 * Returns LIBREGEX_SUCCESS, or LIBREGEX_MALLOC if malloc failed.
 */
libregex_result _regex_node_create_clear(regex_node** node){
    regex_node* fresh = malloc(sizeof(*fresh));

    *node = fresh;
    if(fresh == NULL){
        return LIBREGEX_MALLOC;
    }

    regex_node_clear(fresh);
    return LIBREGEX_SUCCESS;
}
|
|
|
|
/*
 * Creates a REGEX_VALUE node holding the single character `value`,
 * with `next` as its successor.
 *
 * Returns the result of the underlying node allocation; on failure the
 * node is left as that allocation set it.
 */
libregex_result _regex_node_create_value(regex_node** node, char value, regex_node* next){
    libregex_result status = _regex_node_create_clear(node);

    if(status != LIBREGEX_SUCCESS){
        return status;
    }

    (*node)->type = REGEX_VALUE;
    (*node)->data_u.value_s.next = next;
    (*node)->data_u.value_s.value = value;
    return status;
}
|
|
|
|
/*
 * Creates a REGEX_RANGE node storing the bounds `from` and `to`,
 * with `next` as its successor.
 *
 * Returns the result of the underlying node allocation.
 */
libregex_result _regex_node_create_range(regex_node** node, char from, char to, regex_node* next){
    libregex_result status = _regex_node_create_clear(node);

    if(status != LIBREGEX_SUCCESS){
        return status;
    }

    (*node)->type = REGEX_RANGE;
    (*node)->data_u.range_s.next = next;
    (*node)->data_u.range_s.from = from;
    (*node)->data_u.range_s.to = to;
    return status;
}
|
|
|
|
/*
 * Creates a REGEX_ANY node with `next` as its successor.
 *
 * Returns the result of the underlying node allocation.
 */
libregex_result _regex_node_create_any(regex_node** node, regex_node* next){
    libregex_result status = _regex_node_create_clear(node);

    if(status != LIBREGEX_SUCCESS){
        return status;
    }

    (*node)->type = REGEX_ANY;
    (*node)->data_u.any_s.next = next;
    return status;
}
|
|
|
|
/*
 * Creates a REGEX_CONNECT node (a pass-through node that matches no
 * character itself) with `next` as its successor.
 *
 * Returns the result of the underlying node allocation.
 */
libregex_result _regex_node_create_connect(regex_node** node, regex_node* next){
    libregex_result status = _regex_node_create_clear(node);

    if(status != LIBREGEX_SUCCESS){
        return status;
    }

    (*node)->type = REGEX_CONNECT;
    (*node)->data_u.connect_s.next = next;
    return status;
}
|
|
|
|
/*
 * Creates a REGEX_FORK node whose two branches are `left` and `right`.
 *
 * Returns the result of the underlying node allocation.
 */
libregex_result _regex_node_create_fork(regex_node** node, regex_node* left, regex_node* right){
    libregex_result status = _regex_node_create_clear(node);

    if(status != LIBREGEX_SUCCESS){
        return status;
    }

    (*node)->type = REGEX_FORK;
    (*node)->data_u.fork_s.right = right;
    (*node)->data_u.fork_s.left = left;
    return status;
}
|
|
|
|
/*
 * Creates the pair of REGEX_GROUP nodes that bracket a capture group.
 *
 * open:     receives the opening node (its `open` flag is set to 1).
 * close:    receives the closing node (its `open` flag stays 0 from the clear).
 * group_id: identifier stored in both nodes.
 *
 * Each node points at its partner through `other`. On failure both outputs
 * are set to NULL and nothing stays allocated.
 *
 * Returns LIBREGEX_SUCCESS, or the allocation error of whichever node failed.
 */
libregex_result _regex_node_create_group(regex_node** open, regex_node** close, int group_id){
    libregex_result result;

    /*
     * Put both outputs in a known state first: if allocating `open` fails,
     * `close` is never assigned, and the cleanup below must not free
     * whatever value the caller happened to pass in.
     */
    *open = NULL;
    *close = NULL;

    result = _regex_node_create_clear(open);
    if(result == LIBREGEX_SUCCESS){
        result = _regex_node_create_clear(close);
    }

    if(result == LIBREGEX_SUCCESS){
        (*open)->type = (*close)->type = REGEX_GROUP;
        (*open)->data_u.group_s.id = (*close)->data_u.group_s.id = group_id;
        (*open)->data_u.group_s.other = (*close);
        (*close)->data_u.group_s.other = (*open);

        (*open)->data_u.group_s.open = 1;
    } else {
        free(*open);
        free(*close);
        *open = *close = NULL;
    }

    return result;
}
|
|
|
|
/*
 * Allocates a chain descriptor and records its first and last node.
 *
 * new_chain: receives the descriptor, or NULL when the allocation fails.
 * head/tail: stored as given; either may be NULL for an empty chain.
 *
 * Returns LIBREGEX_SUCCESS, or LIBREGEX_MALLOC if malloc failed.
 */
libregex_result _regex_chain_create(regex_chain** new_chain, regex_node* head, regex_node* tail){
    regex_chain* fresh = malloc(sizeof(*fresh));

    *new_chain = fresh;
    if(fresh == NULL){
        return LIBREGEX_MALLOC;
    }

    fresh->head = head;
    fresh->tail = tail;
    return LIBREGEX_SUCCESS;
}
|
|
|
|
/*
 * Returns the address of the node's single `next` link, so callers can
 * both read and rewrite it. Node types without exactly one successor
 * link (forks and every other type not listed) yield NULL.
 */
regex_node** _regex_node_get_next(regex_node* node){
    switch(node->type){
        case REGEX_CONNECT:
            return &node->data_u.connect_s.next;
        case REGEX_VALUE:
            return &node->data_u.value_s.next;
        case REGEX_RANGE:
            return &node->data_u.range_s.next;
        case REGEX_ANY:
            return &node->data_u.any_s.next;
        case REGEX_GROUP:
            return &node->data_u.group_s.next;
        default:
            return NULL;
    }
}
|
|
|
|
/*
 * Makes `to_append` the successor of `append_to`. Does nothing when
 * `append_to` is of a type that has no single `next` link.
 */
void _regex_node_append_node(regex_node* append_to, regex_node* to_append){
    regex_node** link = _regex_node_get_next(append_to);

    if(link == NULL){
        return;
    }
    *link = to_append;
}
|
|
|
|
/*
 * Adds `to_append` at the end of the chain and makes it the new tail.
 *
 * An empty chain (NULL head) gets the node as its head. Otherwise the
 * node is linked after the current tail; if that tail has no single
 * `next` link the chain is left untouched. NULL arguments are ignored.
 */
void _regex_chain_append_node(regex_chain* append_to, regex_node* to_append){
    regex_node** slot;

    if(append_to == NULL || to_append == NULL){
        return;
    }

    if(append_to->head == NULL){
        slot = &append_to->head;
    } else {
        slot = _regex_node_get_next(append_to->tail);
    }

    if(slot == NULL){
        return;
    }

    append_to->tail = to_append;
    *slot = to_append;
}
|
|
|
|
/*
 * Puts `to_prepend` in front of the chain and makes it the new head.
 *
 * The node must have a single `next` link; otherwise nothing happens.
 * If the chain had no tail yet, the node becomes the tail as well.
 * NULL arguments are ignored.
 */
void _regex_chain_prepend_node(regex_chain* prepend_to, regex_node* to_prepend){
    regex_node** link;

    if(prepend_to == NULL || to_prepend == NULL){
        return;
    }

    link = _regex_node_get_next(to_prepend);
    if(link == NULL){
        return;
    }

    *link = prepend_to->head;
    prepend_to->head = to_prepend;

    if(prepend_to->tail == NULL){
        prepend_to->tail = to_prepend;
    }
}
|
|
|
|
/*
 * Links the nodes of `to_append` after the nodes of `append_to`.
 *
 * The head of `to_append` is appended as a node, then the tail of
 * `append_to` is moved to the tail of `to_append` when that is set.
 * The `to_append` descriptor itself is not freed or modified.
 * NULL arguments are ignored.
 */
void _regex_chain_append_chain(regex_chain* append_to, regex_chain* to_append){
    if(append_to == NULL || to_append == NULL){
        return;
    }

    _regex_chain_append_node(append_to, to_append->head);

    if(to_append->tail != NULL){
        append_to->tail = to_append->tail;
    }
}
|
|
|
|
/*
 * Walks every node reachable from `node`, stamps each with `tag_with`
 * in its list_id, and appends it to `append_to`.
 *
 * A node whose list_id already equals `tag_with` is treated as visited
 * and skipped together with everything behind it, which is what makes
 * the walk terminate on cyclic graphs. Successors are visited before
 * the node itself is appended.
 *
 * Returns LIBREGEX_SUCCESS, or LIBREGEX_MALLOC as soon as an append
 * fails (nodes already stamped stay stamped).
 */
libregex_result _regex_find_all(regex_node* node, ll* append_to, int tag_with){
    libregex_result status = LIBREGEX_SUCCESS;

    if(node == NULL || node->list_id == tag_with){
        return status;
    }
    node->list_id = tag_with;

    if(node->type == REGEX_FORK){
        status = _regex_find_all(node->data_u.fork_s.left, append_to, tag_with);
        if(status == LIBREGEX_SUCCESS){
            status = _regex_find_all(node->data_u.fork_s.right, append_to, tag_with);
        }
    } else {
        /* Value, range, any, group and connect nodes all have one successor. */
        regex_node** successor = _regex_node_get_next(node);
        if(successor != NULL){
            status = _regex_find_all(*successor, append_to, tag_with);
        }
    }

    if(status != LIBREGEX_SUCCESS){
        return status;
    }

    if(ll_append(append_to, node) != LIBDS_SUCCESS){
        return LIBREGEX_MALLOC;
    }
    return LIBREGEX_SUCCESS;
}
|
|
|
|
libregex_result _regex_read_value(char* read_into, char* expression, int* string_index){
|
|
libregex_result result = LIBREGEX_SUCCESS;
|
|
if(expression[*string_index] == '\\'){
|
|
(*string_index)++;
|
|
}
|
|
if(expression[*string_index]){
|
|
*read_into = expression[*string_index];
|
|
(*string_index)++;
|
|
} else {
|
|
*read_into = '\0';
|
|
result = LIBREGEX_INVALID;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/*
 * Parses a bracket expression ("[...]") into a chain.
 *
 * build_into:   receives the new chain; set to NULL on failure.
 * expression:   pattern text.
 * string_index: on entry the index of the '['; on success the index just
 *               past the matching ']'.
 *
 * Every member (a single character, or a "from-to" range) becomes a node
 * whose successor is one shared connect node, which is the chain's tail.
 * Members are combined by stacking fork nodes in front of the chain head.
 *
 * Returns LIBREGEX_SUCCESS, LIBREGEX_MALLOC on allocation failure, or
 * LIBREGEX_INVALID when the expression ends before the closing ']'.
 */
libregex_result _regex_build_or(regex_chain** build_into, char* expression, int* string_index){
    regex_node* tail_node = NULL;
    libregex_result result = _regex_chain_create(build_into, NULL, NULL);

    if(result == LIBREGEX_SUCCESS){
        /* The shared exit node is required; do not continue without it. */
        result = _regex_node_create_connect(&tail_node, NULL);
    }

    if(result == LIBREGEX_SUCCESS){
        _regex_chain_append_node(*build_into, tail_node);
        (*string_index)++;
    }

    while(result == LIBREGEX_SUCCESS && expression[*string_index] && expression[*string_index] != ']'){
        regex_node* new_node = NULL;
        regex_node* fork = NULL;
        regex_node* new_head = NULL;
        char from = '\0';
        char to = '\0';

        result = _regex_read_value(&from, expression, string_index);
        if(result == LIBREGEX_SUCCESS){
            if(expression[*string_index] == '-'){
                (*string_index)++;
                result = _regex_read_value(&to, expression, string_index);
            }
        }

        if(result == LIBREGEX_SUCCESS){
            /* `to` still being '\0' means no range was given. */
            result = (to == '\0') ? _regex_node_create_value(&new_node, from, tail_node) :
                                    _regex_node_create_range(&new_node, from, to, tail_node);
            new_head = new_node;
        }

        /* Once the chain holds a member, branch between the old head and the new one. */
        if(result == LIBREGEX_SUCCESS && (*build_into)->head != tail_node){
            result = _regex_node_create_fork(&fork, (*build_into)->head, new_node);
            new_head = fork;
        }

        if(result == LIBREGEX_SUCCESS){
            (*build_into)->head = new_head;
        } else {
            /*
             * Neither node has been linked into the chain yet. Free new_node
             * rather than new_head: when the fork allocation fails, new_head
             * has already been overwritten with the NULL fork.
             */
            free(fork);
            free(new_node);
        }
    }

    if(result == LIBREGEX_SUCCESS){
        if(expression[*string_index] == ']'){
            (*string_index)++;
        } else {
            result = LIBREGEX_INVALID;
        }
    }

    if(result != LIBREGEX_SUCCESS){
        /* *build_into is NULL when the chain descriptor itself could not be allocated. */
        if(*build_into){
            regex_free((*build_into)->head);
            free(*build_into);
        }
        *build_into = NULL;
    }

    return result;
}
|
|
|
|
/*
 * Returns 1 when `c` is one of the postfix repetition operators
 * ('*', '+' or '?'), 0 otherwise.
 */
int _regex_is_char_op(char c){
    switch(c){
        case '*':
        case '+':
        case '?':
            return 1;
        default:
            return 0;
    }
}
|
|
|
|
/*
 * Appends the nodes of `*to_merge` to `*merge_into`, then frees the
 * `*to_merge` descriptor (not its nodes) and sets `*to_merge` to NULL.
 * A NULL `*to_merge` is harmless: nothing is appended.
 */
void _regex_chain_merge(regex_chain** merge_into, regex_chain** to_merge){
    regex_chain* consumed = *to_merge;

    _regex_chain_append_chain(*merge_into, consumed);

    *to_merge = NULL;
    free(consumed);
}
|
|
|
|
/*
 * Parses a (sub-)expression into a chain of automaton nodes.
 *
 * build_into:   receives the finished chain; set to NULL on failure.
 * expression:   pattern text.
 * string_index: on entry either the index of a '(' (the call then parses
 *               that group and stops after its ')'), or one position before
 *               the first character to parse (regex_build passes -1). It is
 *               advanced by one before parsing starts.
 * groups:       running group counter; incremented for every group opened.
 *
 * The most recently parsed element is held in `sub_chain` until the next
 * token is seen, so a following '*', '+' or '?' can still wrap it. Chains
 * finished by '|' are pushed on `chain_stack` and joined with fork nodes
 * once the whole (sub-)expression has been read.
 *
 * Returns LIBREGEX_SUCCESS, LIBREGEX_MALLOC on allocation failure, or
 * LIBREGEX_INVALID for malformed input (an operator with nothing to apply
 * to, a missing ')', an unmatched ')' at top level, an empty last
 * alternative).
 */
libregex_result _regex_build_chain(regex_chain** build_into, char* expression, int* string_index, int* groups){
    regex_node* group_open = NULL;
    regex_node* group_close = NULL;
    regex_chain* current_chain = NULL;
    regex_chain* sub_chain = NULL;
    ll chain_stack;
    int is_subchain = *string_index >= 0 && expression[*string_index] == '(';
    libregex_result result = LIBREGEX_SUCCESS;

    if(is_subchain){
        result = _regex_node_create_group(&group_open, &group_close, ++(*groups));
    }

    if(result == LIBREGEX_SUCCESS){
        result = _regex_chain_create(&current_chain, NULL, NULL);
    }

    if(result == LIBREGEX_SUCCESS){
        (*string_index)++;
    }

    ll_init(&chain_stack);

    /*
     * `result` is tested first: after a failed setup the index has not been
     * advanced and may still be -1, so the expression must not be read.
     */
    while(result == LIBREGEX_SUCCESS && expression[*string_index] && expression[*string_index] != ')'){
        if(_regex_is_char_op(expression[*string_index])){
            if(sub_chain){
                regex_node* connect = NULL;
                regex_node* fork = NULL;

                result = _regex_node_create_connect(&connect, NULL);
                if(result == LIBREGEX_SUCCESS){
                    result = _regex_node_create_fork(&fork, sub_chain->head, connect);
                }

                if(result == LIBREGEX_SUCCESS){
                    if(expression[*string_index] == '+' || expression[*string_index] == '*'){
                        /* Loop back: the element's tail leads to the fork. */
                        _regex_chain_append_node(sub_chain, fork);
                        if(expression[*string_index] == '*'){
                            /* Zero repetitions allowed: enter through the fork. */
                            sub_chain->head = fork;
                        }
                    } else {
                        /* '?': enter through the fork, run the element at most once. */
                        _regex_chain_append_node(sub_chain, connect);
                        sub_chain->head = fork;
                    }
                    sub_chain->tail = connect;

                    _regex_chain_merge(&current_chain, &sub_chain);
                    (*string_index)++;
                } else {
                    free(connect);
                    free(fork);
                }
            } else {
                /* Operator with no preceding element. */
                result = LIBREGEX_INVALID;
            }
        } else if(expression[*string_index] == '('){
            _regex_chain_merge(&current_chain, &sub_chain);

            result = _regex_build_chain(&sub_chain, expression, string_index, groups);
        } else if(expression[*string_index] == '['){
            _regex_chain_merge(&current_chain, &sub_chain);

            result = _regex_build_or(&sub_chain, expression, string_index);
        } else if(expression[*string_index] == '|'){
            _regex_chain_merge(&current_chain, &sub_chain);

            result = ll_append(&chain_stack, current_chain) == LIBDS_SUCCESS ? LIBREGEX_SUCCESS : LIBREGEX_MALLOC;
            if(result == LIBREGEX_SUCCESS){
                result = _regex_chain_create(&current_chain, NULL, NULL);
            }

            (*string_index)++;
        } else {
            regex_node* new_node = NULL;

            _regex_chain_merge(&current_chain, &sub_chain);

            if(expression[*string_index] == '.'){
                result = _regex_node_create_any(&new_node, NULL);
                if(result == LIBREGEX_SUCCESS){
                    (*string_index)++;
                }
            } else {
                char into = '\0';
                result = _regex_read_value(&into, expression, string_index);
                if(result == LIBREGEX_SUCCESS){
                    result = _regex_node_create_value(&new_node, into, NULL);
                }
            }

            if(result == LIBREGEX_SUCCESS){
                result = _regex_chain_create(&sub_chain, new_node, new_node);
            }

            if(result != LIBREGEX_SUCCESS){
                free(new_node);
                free(sub_chain);
                sub_chain = NULL;
            }
        }
    }

    if(result == LIBREGEX_SUCCESS){
        _regex_chain_merge(&current_chain, &sub_chain);
    } else {
        if(sub_chain){
            regex_free(sub_chain->head);
            free(sub_chain);
            sub_chain = NULL;
        }
    }

    if(result == LIBREGEX_SUCCESS && is_subchain && expression[*string_index] != ')'){
        result = LIBREGEX_INVALID;
    }

    /*
     * A ')' that stops a top-level parse has no matching '('. Reporting
     * success here would silently drop the rest of the pattern.
     */
    if(result == LIBREGEX_SUCCESS && !is_subchain && expression[*string_index] == ')'){
        result = LIBREGEX_INVALID;
    }

    if(result == LIBREGEX_SUCCESS && chain_stack.head){
        if(current_chain->tail){
            regex_node* end_node = NULL;
            result = _regex_node_create_connect(&end_node, NULL);
            _regex_chain_append_node(current_chain, end_node);

            /* Join every stacked alternative: all of them end in end_node. */
            while(chain_stack.tail && result == LIBREGEX_SUCCESS){
                regex_node* fork = NULL;
                regex_chain* new_chain = ll_poptail(&chain_stack);

                if(new_chain == NULL){
                    continue;
                }

                /*
                 * NOTE(review): an empty stacked alternative (as in "|a") has a
                 * NULL head here, so the fork's right branch is NULL and that
                 * alternative is dropped - confirm this is intended.
                 */
                result = _regex_node_create_fork(&fork, current_chain->head, new_chain->head);

                if(result == LIBREGEX_SUCCESS){
                    _regex_chain_append_node(new_chain, end_node);
                    current_chain->head = fork;
                } else {
                    regex_free(new_chain->head);
                }

                free(new_chain);
            }
        } else {
            /* The last alternative is empty. */
            result = LIBREGEX_INVALID;
        }
    }

    if(result == LIBREGEX_SUCCESS){
        if(is_subchain){
            (*string_index)++;
        }
        /* Both group nodes are NULL at top level, which makes these no-ops. */
        _regex_chain_append_node(current_chain, group_close);
        _regex_chain_prepend_node(current_chain, group_open);
        *build_into = current_chain;
    } else {
        while(chain_stack.tail){
            regex_chain* new_chain = ll_poptail(&chain_stack);
            if(new_chain){
                regex_free(new_chain->head);
                free(new_chain);
            }
        }

        if(current_chain){
            regex_free(current_chain->head);
            free(current_chain);
        }
        free(group_close);
        free(group_open);
        *build_into = NULL;
    }

    return result;
}
|
|
|
|
/*
 * Resets a node to its blank state: type REGEX_CLEAR, list_id -1, and
 * the whole data union zeroed.
 */
void regex_node_clear(regex_node* node){
    memset(&node->data_u, 0, sizeof(node->data_u));
    node->list_id = -1;
    node->type = REGEX_CLEAR;
}
|
|
/*
 * Frees every node reachable from `root`.
 *
 * The nodes are first collected into a temporary list (each stamped with
 * list_id -2 by the walk), then every collected node is freed and the
 * list itself is released. A NULL root collects nothing.
 *
 * Returns the result of the collection walk; nodes collected before a
 * failure are still freed.
 */
libregex_result regex_free(regex_node* root){
    ll reachable;
    libregex_result status;

    ll_init(&reachable);
    status = _regex_find_all(root, &reachable, -2);

    ll_foreach(&reachable, NULL, compare_always, _regex_node_foreach_free);
    ll_free(&reachable);

    return status;
}
|
|
/*
 * Compiles `expression` into an automaton.
 *
 * root:       receives the entry node, or NULL on failure.
 * expression: pattern text.
 *
 * A REGEX_END node is allocated up front and attached behind the parsed
 * chain; the chain descriptor is released once the head has been handed
 * out. On failure everything built so far is freed.
 *
 * Returns LIBREGEX_SUCCESS or the first error from allocation or parsing.
 */
libregex_result regex_build(regex_node** root, char* expression){
    regex_chain* chain = NULL;
    regex_node* terminator = NULL;
    int cursor = -1; /* the chain builder advances this by one before parsing */
    int group_count = 0;
    libregex_result status = _regex_node_create_clear(&terminator);

    if(status == LIBREGEX_SUCCESS){
        terminator->type = REGEX_END;
        status = _regex_build_chain(&chain, expression, &cursor, &group_count);
    }

    if(status != LIBREGEX_SUCCESS){
        free(terminator);
        if(chain != NULL){
            regex_free(chain->head);
            free(chain);
        }
        *root = NULL;
        return status;
    }

    _regex_chain_append_node(chain, terminator);
    *root = chain->head;
    free(chain);

    return status;
}
|
|
|
|
/*
 * Tells whether `node` consumes the character `c`.
 *
 * A value node needs an exact match, a range node needs `c` between its
 * bounds (inclusive), and an any node accepts everything except '\0'.
 * All other node types consume nothing and yield 0.
 */
int _regex_node_matches(regex_node* node, char c){
    switch(node->type){
        case REGEX_VALUE:
            return node->data_u.value_s.value == c;
        case REGEX_RANGE:
            return node->data_u.range_s.from <= c && c <= node->data_u.range_s.to;
        case REGEX_ANY:
            return c != '\0';
        default:
            return 0;
    }
}
|
|
|
|
/*
 * Adds `node` to the state list `list`, following pass-through nodes.
 *
 * A node is handled only if it has not yet been stamped with this list's
 * id (list_id < list->id) and the list still has room. Connect and fork
 * nodes are never stored; their successors are added instead. A group
 * node is stored after its successor has been added. Cleared nodes are
 * neither stored nor followed.
 */
void _regex_node_add(regex_node* node, regex_list* list){
    if(node && node->list_id < list->id && list->size < LIBREGEX_MAX_NODE_COUNT){
        node->list_id = list->id;

        if(node->type == REGEX_CONNECT){
            _regex_node_add(node->data_u.connect_s.next, list);
        } else if(node->type == REGEX_FORK){
            _regex_node_add(node->data_u.fork_s.left, list);
            _regex_node_add(node->data_u.fork_s.right, list);
        } else if(!(node->type == REGEX_CLEAR)){
            if(node->type == REGEX_GROUP){
                _regex_node_add(node->data_u.group_s.next, list);
            }

            /*
             * The recursive add above may have used up the remaining room, so
             * the capacity check made on entry no longer holds. Check again
             * before storing to avoid writing past the end of list->nodes.
             */
            if(list->size < LIBREGEX_MAX_NODE_COUNT){
                list->nodes[list->size++] = node;
            }
        }
    }
}
|
|
|
|
/*
 * Advances the simulation by one input character.
 *
 * For every node in the current state list:
 *  - a node that consumes sim->string[sim->index] contributes its
 *    successor to the next state list;
 *  - an end node marks the result as matching;
 *  - a group node records a position: an opening node stores the current
 *    index as the group's start, a closing node stores the previous index
 *    as its end and publishes the start/end pair in sim->result->groups
 *    (allocating the entry on first use).
 * Afterwards the two lists are swapped and the new "next" list is emptied
 * and given a fresh id.
 *
 * Returns LIBREGEX_SUCCESS, or LIBREGEX_MALLOC if a group entry could not
 * be allocated (the remaining nodes are still processed).
 */
libregex_result _regex_step(regex_sim* sim){
    libregex_result result = LIBREGEX_SUCCESS;
    regex_list* swap_temp;
    int index = 0;

    for(; index < sim->current->size; index++){
        regex_node* current = sim->current->nodes[index];

        if(_regex_node_matches(current, sim->string[sim->index])){
            regex_node** next = _regex_node_get_next(current);
            if(next){
                _regex_node_add(*next, sim->next);
            }
        } else if(current->type == REGEX_END){
            sim->result->matches = 1;
        } else if(current->type == REGEX_GROUP){
            int group_id = current->data_u.group_s.id;
            int is_open = current->data_u.group_s.open;
            int position = sim->index - 1 + is_open;

            /*
             * Bound the id before touching either group array. The upper
             * bound is the one the original applied to result->groups, so the
             * set of reported groups is unchanged.
             * NOTE(review): `<` leaves the last slot unused; `<=` may have
             * been intended - confirm against the array sizes.
             */
            int id_in_range = group_id >= 1 && group_id < LIBREGEX_MAX_GROUP_COUNT;

            /*
             * A closing node seen at index 0 gives position -1 (an empty
             * group at the start); that position is recorded as before, but
             * the string must not be read at a negative index.
             */
            if(id_in_range && (position < 0 || sim->string[position] != '\0')){
                int slot = group_id - 1;

                if(is_open){
                    sim->groups[slot].from = position;
                } else {
                    regex_match* new_match = sim->result->groups[slot];

                    sim->groups[slot].to = position;

                    if(new_match == NULL){
                        new_match = malloc(sizeof(*new_match));
                    }

                    if(new_match){
                        new_match->from = sim->groups[slot].from;
                        new_match->to = sim->groups[slot].to;
                        sim->result->groups[slot] = new_match;
                    } else {
                        result = LIBREGEX_MALLOC;
                    }
                }
            }
        }
    }

    swap_temp = sim->current;
    sim->current = sim->next;
    sim->next = swap_temp;

    /* The two lists start at ids 0 and 1, so stepping by 2 keeps ids increasing. */
    sim->next->size = 0;
    sim->next->id += 2;

    return result;
}
|
|
|
|
/*
 * Runs the automaton `root` over `string`.
 *
 * root:         entry node produced by regex_build.
 * string:       NUL-terminated input.
 * build_result: overwritten with the outcome: `matches` is set to 1 when
 *               an end node is reached, and `groups` receives allocated
 *               entries for groups that closed (release them with
 *               regex_result_free). Any previous contents are discarded
 *               without being freed.
 *
 * Two state lists are stepped once per character, including the
 * terminator. Afterwards the per-node list ids are reset to -1 so the
 * automaton can be used again.
 *
 * Returns LIBREGEX_SUCCESS, or the first error from stepping or from the
 * reset walk.
 */
libregex_result regex_match_string(regex_node* root, char* string, regex_result* build_result){
    ll clear_ll;
    libregex_result result;
    libregex_result reset_result;
    regex_sim sim;
    regex_list list_a;
    regex_list list_b;

    list_a.size = 0;
    list_a.id = 0;

    list_b.size = 0;
    list_b.id = 1;

    build_result->matches = 0;
    memset(&build_result->groups, 0, sizeof(build_result->groups));

    memset(&sim.groups, 0, sizeof(sim.groups));
    sim.current = &list_a;
    sim.next = &list_b;
    sim.result = build_result;
    sim.string = string;
    sim.index = 0;

    _regex_node_add(root, &list_a);

    do {
        result = _regex_step(&sim);
    } while(sim.string[sim.index++] != '\0' && result == LIBREGEX_SUCCESS);

    /*
     * Reset the node tags even when stepping failed. Stale list ids are
     * never below the fresh ids 0/1 of a later call, so nodes left tagged
     * would not be added again and the automaton would stop matching.
     */
    ll_init(&clear_ll);
    reset_result = _regex_find_all(root, &clear_ll, -1);
    ll_clear(&clear_ll);

    if(result == LIBREGEX_SUCCESS){
        result = reset_result;
    }

    return result;
}
|
|
/*
 * Releases the group entries held by a match result and resets it.
 *
 * Each slot is set to NULL after being freed, so calling this twice on
 * the same result does not free anything a second time. A NULL `result`
 * is ignored.
 */
void regex_result_free(regex_result* result){
    int index = 0;

    if(result == NULL){
        return;
    }

    for(; index < LIBREGEX_MAX_GROUP_COUNT; index++){
        free(result->groups[index]);
        result->groups[index] = NULL;
    }
    result->matches = 0;
}
|
|
|