/**
 * Public interface of libregex: the NFA node representation, the
 * structures used while matching, and the build / match / free entry points.
 */
#ifndef LIBREGEX_HEADER
#define LIBREGEX_HEADER

/* Capacity of the node array in a regex_list_s. */
#define LIBREGEX_MAX_NODE_COUNT 64
/* Capacity of the group arrays in regex_result_s and regex_sim_s. */
#define LIBREGEX_MAX_GROUP_COUNT 64

/**
 * Enum used to represent the result
 * of error prone libregex functions.
 */
enum libregex_result_e {
    /**
     * Represents a successful operation. All went well.
     */
    LIBREGEX_SUCCESS,
    /**
     * Represents an invalid regular expression.
     */
    LIBREGEX_INVALID,
    /**
     * Represents an allocation failure.
     */
    LIBREGEX_MALLOC
};

/**
 * An enum to represent the tag of the tagged union that is
 * a regular expression NFA node.
 */
enum regex_node_type_e {
    /**
     * Represents a node whose value was not yet set.
     * This is usually the case after the initial creation of the node.
     */
    REGEX_CLEAR,
    /**
     * Represents a value node.
     * The value node matches a single character from the input.
     */
    REGEX_VALUE,
    /**
     * Represents a range node.
     * The range node matches a range of characters from the input.
     */
    REGEX_RANGE,
    /**
     * Represents an "any" node.
     * The any node matches any character from the input.
     */
    REGEX_ANY,
    /**
     * Represents a connection node, only used for structural
     * purposes.
     */
    REGEX_CONNECT,
    /**
     * Represents a fork node.
     * This node is used for structural purposes, but, unlike the
     * connect node, can transition into two NFA nodes.
     */
    REGEX_FORK,
    /**
     * Represents the beginning or end of a group.
     * Beyond its use for matching substrings, the group node is treated
     * like a connection node.
     */
    REGEX_GROUP,
    /**
     * Represents the end of the NFA, and a successful match.
     */
    REGEX_END
};

/**
 * Struct representing a single NFA node.
 */
struct regex_node_s {
    /**
     * The type of the NFA node; selects the active member of data_u.
     */
    enum regex_node_type_e type;
    /**
     * The ID of the list this node was last added into.
     * The list ID represents the last "state" list this node
     * was added to, to prevent the node being checked multiple times
     * during matching.
     *
     * Outside of matching, this is also used for iteration over NFA nodes:
     * once a node is iterated over, its list ID is set to a certain value
     * believed to be unique during the iteration so that it is not re-checked.
     */
    int list_id;
    /**
     * The union part of the "tagged union".
     * data_u represents the possible data types
     * that the node can represent.
     */
    union {
        /**
         * Represents data carried by a value node.
         */
        struct {
            /**
             * The value the node matches.
             */
            char value;
            /**
             * The next node in the NFA.
             */
            struct regex_node_s* next;
        } value_s;
        /**
         * Represents data carried by a range node.
         */
        struct {
            /**
             * Represents the bottom bounds of the range, inclusive.
             */
            char from;
            /**
             * Represents the top bounds of the range, inclusive.
             */
            char to;
            /**
             * The next node in the NFA.
             */
            struct regex_node_s* next;
        } range_s;
        /**
         * Represents data carried by an "any" node.
         */
        struct {
            /**
             * The next node in the NFA.
             */
            struct regex_node_s* next;
        } any_s;
        /**
         * Represents data carried by a connection node.
         */
        struct {
            /**
             * The next node in the NFA.
             */
            struct regex_node_s* next;
        } connect_s;
        /**
         * Represents data carried by a fork node.
         */
        struct {
            /**
             * The first next node in the NFA.
             */
            struct regex_node_s* left;
            /**
             * The second next node in the NFA.
             */
            struct regex_node_s* right;
        } fork_s;
        /**
         * Represents data carried by a group node.
         */
        struct {
            /**
             * Boolean, whether this is the beginning or end of a group.
             */
            int open;
            /**
             * The id of the group.
             */
            int id;
            /**
             * The other group node of the same ID.
             */
            struct regex_node_s* other;
            /**
             * The next node in the NFA.
             */
            struct regex_node_s* next;
        } group_s;
    } data_u;
};

/**
 * Represents a small NFA to be treated as a unit.
 */
struct regex_chain_s {
    /**
     * The first node in the NFA chain.
     */
    struct regex_node_s* head;
    /**
     * The last node in the NFA chain.
     */
    struct regex_node_s* tail;
};

/**
 * Struct that represents a list of nodes currently being
 * checked against input.
 */
struct regex_list_s {
    /**
     * The ID of the list is used together with
     * regex_node_s' list_id to prevent multiple
     * pointers to the same node in the list.
     */
    int id;
    /**
     * The number of nodes currently in the list.
     */
    int size;
    /**
     * The list of nodes.
     */
    struct regex_node_s* nodes[LIBREGEX_MAX_NODE_COUNT];
};

/**
 * Represents a single matched group.
 */
struct regex_match_s {
    /**
     * The starting index of the match in the string, inclusive.
     */
    int from;
    /**
     * The ending index of the match in the string, inclusive.
     */
    int to;
};

/**
 * Struct that represents the result of running a match.
 */
struct regex_result_s {
    /**
     * Boolean, whether the regular expression matched or not.
     */
    int matches;
    /**
     * List of groups that were matched successfully.
     */
    struct regex_match_s* groups[LIBREGEX_MAX_GROUP_COUNT];
};

/**
 * Struct that represents data used to match
 * a string against a regular expression.
 */
struct regex_sim_s {
    /**
     * The string being matched.
     */
    char* string;
    /**
     * The index of the string currently being inspected.
     */
    int index;
    /**
     * The current list of states ready to be matched.
     */
    struct regex_list_s* current;
    /**
     * The list of states to be matched in the next index.
     */
    struct regex_list_s* next;
    /**
     * The result struct being built during the match.
     */
    struct regex_result_s* result;
    /**
     * A list of groups potentially being constructed.
     */
    struct regex_match_s groups[LIBREGEX_MAX_GROUP_COUNT];
};

typedef enum libregex_result_e libregex_result;
typedef enum regex_node_type_e regex_node_type;
typedef struct regex_node_s regex_node;
typedef struct regex_chain_s regex_chain;
typedef struct regex_list_s regex_list;
typedef struct regex_match_s regex_match;
typedef struct regex_result_s regex_result;
typedef struct regex_sim_s regex_sim;

/**
 * Clears a node completely, resetting all the values to default.
 * @param node the node to clear.
 */
void regex_node_clear(regex_node* node);

/**
 * Frees an entire NFA, starting from the given root node.
 * @param root the root, or starting node, of the NFA.
 * @return the result of the operation: LIBREGEX_SUCCESS if all goes well, or an error code.
 */
libregex_result regex_free(regex_node* root);

/**
 * Builds a regular expression NFA from the given regular expression string.
 * @param root the root node to build into.
 * @param expression the expression to construct the NFA from.
 * @return the result of the operation: LIBREGEX_SUCCESS if all goes well, or an error code.
 */
libregex_result regex_build(regex_node** root, char* expression);

/**
 * Matches the regular expression against a given string.
 * @param root the root of a regular expression NFA.
 * @param string the string to be matched.
 * @param result the result to be populated with the data from matching the string.
 * @return the result of the operation: LIBREGEX_SUCCESS if all goes well, or an error code.
 */
libregex_result regex_match_string(regex_node* root, char* string, regex_result* result);

/**
 * Frees data used by regex_match_string in a regex_result struct.
 * The actual struct is not freed.
 * @param result the result struct.
 */
void regex_result_free(regex_result* result);

#endif /* LIBREGEX_HEADER */