From 7f370983f28263eadfb20f25a0015c8a9f77e757 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Mon, 2 Jan 2017 21:37:40 -0800 Subject: [PATCH] Add struct declarations to libregex.h, and also typedef them. --- include/libregex.h | 291 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 291 insertions(+) diff --git a/include/libregex.h b/include/libregex.h index 37043bf..ef16101 100644 --- a/include/libregex.h +++ b/include/libregex.h @@ -1,4 +1,295 @@ #ifndef LIBREGEX_HEADER #define LIBREGEX_HEADER +#define LIBREGEX_MAX_NODE_COUNT 64 +#define LIBREGEX_MAX_GROUP_COUNT 64 + +/** + * Enum used to represent the result + * of error-prone libregex functions. + */ +enum libregex_result_e { + /** + * Represents a successful operation. All went well. + */ + LIBREGEX_SUCCESS, + /** + * Represents an invalid regular expression. + */ + LIBREGEX_INVALID, + /** + * Represents an allocation failure. + */ + LIBREGEX_MALLOC +}; + +/** + * An enum to represent the tagged union that is + * a regular expression NFA node. + */ +enum regex_node_type_e { + /** + * Represents a node whose value was not yet set. + * This is usually the case after the initial creation of the node. + */ + REGEX_CLEAR, + /** + * Represents a value node. + * The value node matches a single character from the input. + */ + REGEX_VALUE, + /** + * Represents a range node. + * The range node matches a range of characters from the input. + */ + REGEX_RANGE, + /** + * Represents an "any" node. + * The any node matches any character from the input. + */ + REGEX_ANY, + /** + * Represents a connection node, only used for structural + * purposes. + */ + REGEX_CONNECT, + /** + * Represents a fork node. + * This node is used for structural purposes, but, unlike the + * connect node, can transition into two NFA nodes. + */ + REGEX_FORK, + /** + * Represents the beginning or end of a group. + * Beyond its use for matching substrings, the group node is treated + * like a connection node. 
+ */ + REGEX_GROUP, + /** + * Represents the end of the NFA, and a successful match. + */ + REGEX_END +}; + +/** + * Struct representing a single NFA node. + */ +struct regex_node_s { + /** + * The type of the NFA node. + */ + enum regex_node_type_e type; + /** + * The ID of the list this node was last added into. + * The list ID represents the last "state" list this node + * was added to, to prevent the node being checked multiple times + * during matching. + * + * Outside of matching, this is also used for iteration over NFA nodes: + * once a node is iterated over, its list ID is set to a certain value + * believed to be unique during the iteration so that it is not re-checked. + */ + int list_id; + + /** + * The union part of the "tagged union". + * data_u represents the possible data types + * that the node can represent. + */ + union { + /** + * Represents data carried by a value node. + */ + struct { + /** + * The value the node matches. + */ + char value; + /** + * The next node in the NFA. + */ + struct regex_node_s* next; + } value_s; + + /** + * Represents data carried by a range node. + */ + struct { + /** + * Represents the bottom bounds of the range, inclusive. + */ + char from; + /** + * Represents the top bounds of the range, inclusive. + */ + char to; + /** + * The next node in the NFA. + */ + struct regex_node_s* next; + } range_s; + + /** + * Represents data carried by an "any" node. + */ + struct { + /** + * The next node in the NFA. + */ + struct regex_node_s* next; + } any_s; + + /** + * Represents data carried by a connection node. + */ + struct { + /** + * The next node in the NFA. + */ + struct regex_node_s* next; + } connect_s; + + /** + * Represents data carried by a fork node. + */ + struct { + /** + * The first next node in the NFA. + */ + struct regex_node_s* left; + /** + * The second next node in the NFA. + */ + struct regex_node_s* right; + } fork_s; + + /** + * Represents data carried by a group node. 
+ */ + struct { + /** + * Boolean, whether this is the beginning or end of a group. + */ + int open; + /** + * The id of the group. + */ + int id; + /** + * The other group node of the same ID. + */ + struct regex_node_s* other; + /** + * The next node in the NFA. + */ + struct regex_node_s* next; + } group_s; + } data_u; +}; + +/** + * Represents a small NFA to be treated as a unit. + */ +struct regex_chain_s { + /** + * The first node in the NFA chain. + */ + struct regex_node_s* head; + /** + * The last node in the NFA chain. + */ + struct regex_node_s* tail; +}; + +/** + * Struct that represents a list of nodes currently being + * checked against input. + */ +struct regex_list_s { + /** + * The ID of the list is used together with + * regex_node_s' list_id to prevent multiple + * pointers to the same node in the list. + */ + int id; + /** + * The number of nodes currently in the list. + */ + int size; + /** + * The list of nodes. + */ + struct regex_node_s* nodes[LIBREGEX_MAX_NODE_COUNT]; +}; + +/** + * Represents a single matched group. + */ +struct regex_match_s { + /** + * The starting index of the match in the string, inclusive. + */ + int from; + /** + * The ending index of the match in the string, inclusive. + */ + int to; +}; + +/** + * Struct that represents the result of running a match. + */ +struct regex_result_s { + /** + * Boolean, whether the regular expression matched or not. + */ + int matches; + /** + * List of groups that were matched successfully. + */ + struct regex_match_s* groups[LIBREGEX_MAX_GROUP_COUNT]; +}; + +/** + * Struct that represents data used to match + * a string against a regular expression. + */ +struct regex_sim_s { + /** + * The string being matched. + */ + char* string; + /** + * The index of the string currently being inspected. + */ + int index; + + /** + * The current list of states ready to be matched. + */ + struct regex_list_s* current; + /** + * The list of states to be matched in the next index. 
+ */ + struct regex_list_s* next; + + /** + * The result struct being built during the match. + */ + struct regex_result_s* result; + /** + * A list of groups potentially being constructed. + */ + struct regex_match_s groups[LIBREGEX_MAX_GROUP_COUNT]; +}; + +typedef enum libregex_result_e libregex_result; +typedef enum regex_node_type_e regex_node_type; +typedef struct regex_node_s regex_node; +typedef struct regex_chain_s regex_chain; +typedef struct regex_list_s regex_list; +typedef struct regex_match_s regex_match; +typedef struct regex_result_s regex_result; +typedef struct regex_sim_s regex_sim; + #endif \ No newline at end of file