libregex/include/libregex.h

330 lines
7.6 KiB
C

#ifndef LIBREGEX_HEADER
#define LIBREGEX_HEADER
/**
 * Capacity of the `nodes` array in struct regex_list_s, i.e. the
 * largest number of node pointers a single list can hold.
 */
#define LIBREGEX_MAX_NODE_COUNT 64
/**
 * Capacity of the `groups` arrays in struct regex_result_s and
 * struct regex_sim_s.
 */
#define LIBREGEX_MAX_GROUP_COUNT 64
/**
 * Status codes reported by the libregex functions that can fail.
 */
enum libregex_result_e {
    /** The operation succeeded; all went well. */
    LIBREGEX_SUCCESS = 0,
    /** The regular expression was invalid. */
    LIBREGEX_INVALID = 1,
    /** A memory allocation failed. */
    LIBREGEX_MALLOC = 2
};
/**
 * Tag values for the tagged union that is a regular expression
 * NFA node (see struct regex_node_s).
 */
enum regex_node_type_e {
    /**
     * Node whose value has not been set yet. This is usually the
     * state of a node right after its initial creation.
     */
    REGEX_CLEAR = 0,
    /** Value node: matches a single character from the input. */
    REGEX_VALUE = 1,
    /** Range node: matches a range of characters from the input. */
    REGEX_RANGE = 2,
    /** "Any" node: matches any character from the input. */
    REGEX_ANY = 3,
    /** Connection node, used for structural purposes only. */
    REGEX_CONNECT = 4,
    /**
     * Fork node. Structural like the connect node, but it can
     * transition into two NFA nodes instead of one.
     */
    REGEX_FORK = 5,
    /**
     * Beginning or end of a group. Apart from its role in matching
     * substrings, it is treated like a connection node.
     */
    REGEX_GROUP = 6,
    /** End of the NFA; represents a successful match. */
    REGEX_END = 7
};
/**
 * A single NFA node, laid out as a tagged union: `type` is the tag
 * and `data_u` holds the per-kind payload.
 */
struct regex_node_s {
    /** The kind of NFA node this is. */
    enum regex_node_type_e type;
    /**
     * ID of the "state" list this node was last added to. During
     * matching it prevents the node from being checked more than once.
     *
     * Outside of matching it doubles as a visited marker when iterating
     * over NFA nodes: a visited node gets its list ID set to a value
     * believed to be unique for that iteration, so it is not re-checked.
     */
    int list_id;
    /**
     * The union half of the tagged union: the possible payloads a
     * node can carry.
     */
    union {
        /** Payload of a value node. */
        struct {
            /** The character the node matches. */
            char value;
            /** The next node in the NFA. */
            struct regex_node_s* next;
        } value_s;
        /** Payload of a range node. */
        struct {
            /** Lower bound of the range, inclusive. */
            char from;
            /** Upper bound of the range, inclusive. */
            char to;
            /** The next node in the NFA. */
            struct regex_node_s* next;
        } range_s;
        /** Payload of an "any" node. */
        struct {
            /** The next node in the NFA. */
            struct regex_node_s* next;
        } any_s;
        /** Payload of a connection node. */
        struct {
            /** The next node in the NFA. */
            struct regex_node_s* next;
        } connect_s;
        /** Payload of a fork node. */
        struct {
            /** The first of the two next nodes in the NFA. */
            struct regex_node_s* left;
            /** The second of the two next nodes in the NFA. */
            struct regex_node_s* right;
        } fork_s;
        /** Payload of a group node. */
        struct {
            /**
             * Boolean telling the beginning of a group apart from its end.
             * NOTE(review): presumably non-zero marks the beginning - confirm
             * against the implementation.
             */
            int open;
            /** The ID of the group. */
            int id;
            /** The other group node carrying the same ID. */
            struct regex_node_s* other;
            /** The next node in the NFA. */
            struct regex_node_s* next;
        } group_s;
    } data_u;
};
/**
 * A small NFA fragment that is handled as a single unit.
 */
struct regex_chain_s {
    /** First node of the chain. */
    struct regex_node_s* head;
    /** Last node of the chain. */
    struct regex_node_s* tail;
};
/**
 * A list of the nodes currently being checked against the input.
 */
struct regex_list_s {
    /**
     * ID of this list. Compared with a node's `list_id`
     * (struct regex_node_s) so the same node is not stored in the
     * list more than once.
     */
    int id;
    /** Number of nodes currently held in the list. */
    int size;
    /** Storage for the node pointers, at most LIBREGEX_MAX_NODE_COUNT. */
    struct regex_node_s* nodes[LIBREGEX_MAX_NODE_COUNT];
};
/**
 * The span of one matched group.
 */
struct regex_match_s {
    /** Index in the string where the match starts, inclusive. */
    int from;
    /** Index in the string where the match ends, inclusive. */
    int to;
};
/**
 * The result of running a match.
 */
struct regex_result_s {
    /** Boolean: whether the regular expression matched or not. */
    int matches;
    /**
     * The groups that were matched successfully, one pointer slot
     * per group, at most LIBREGEX_MAX_GROUP_COUNT.
     * NOTE(review): presumably slots of unmatched groups are NULL and
     * the pointed-to entries are released by regex_result_free - confirm
     * against the implementation.
     */
    struct regex_match_s* groups[LIBREGEX_MAX_GROUP_COUNT];
};
/**
 * Working state used while matching a string against a
 * regular expression.
 */
struct regex_sim_s {
    /** The string being matched. */
    char* string;
    /** Index into the string that is currently being inspected. */
    int index;
    /** The states ready to be matched at the current index. */
    struct regex_list_s* current;
    /** The states to be matched at the next index. */
    struct regex_list_s* next;
    /** The result struct that is filled in as the match proceeds. */
    struct regex_result_s* result;
    /** Groups that are potentially still under construction. */
    struct regex_match_s groups[LIBREGEX_MAX_GROUP_COUNT];
};
/*
 * Short aliases for the enum and struct tags declared in this header,
 * so they can be used without the `enum`/`struct` keyword.
 */
typedef enum libregex_result_e libregex_result;
typedef enum regex_node_type_e regex_node_type;
typedef struct regex_node_s regex_node;
typedef struct regex_chain_s regex_chain;
typedef struct regex_list_s regex_list;
typedef struct regex_match_s regex_match;
typedef struct regex_result_s regex_result;
typedef struct regex_sim_s regex_sim;
/**
 * Clears a node completely, resetting all of its values to their defaults.
 * NOTE(review): presumably the node ends up with type REGEX_CLEAR - confirm
 * against the implementation.
 * @param node the node to clear.
 */
void regex_node_clear(regex_node* node);
/**
 * Frees an entire NFA, starting from the given root node.
 * NOTE(review): presumably no node of the NFA may be used after this
 * call - confirm against the implementation.
 * @param root the root, or starting node, of the NFA.
 * @return the result of the operation: LIBREGEX_SUCCESS if all goes well,
 * or an error code (see enum libregex_result_e).
 */
libregex_result regex_free(regex_node* root);
/**
 * Builds a regular expression NFA from the given regular expression string.
 * NOTE(review): presumably on success *root receives the first node of the
 * new NFA and the caller releases it with regex_free - confirm against the
 * implementation.
 * @param root the root node to build into.
 * @param expression the expression to construct the NFA from.
 * @return the result of the operation: LIBREGEX_SUCCESS if all goes well,
 * or an error code (see enum libregex_result_e).
 */
libregex_result regex_build(regex_node** root, char* expression);
/**
 * Matches the regular expression against a given string.
 * NOTE(review): presumably whether the string matched is reported through
 * result->matches rather than through the return value, and the populated
 * result should later be passed to regex_result_free - confirm against the
 * implementation.
 * @param root the root of a regular expression NFA.
 * @param string the string to be matched.
 * @param result the result to be populated with the data from matching the string.
 * @return the result of the operation: LIBREGEX_SUCCESS if all goes well,
 * or an error code (see enum libregex_result_e).
 */
libregex_result regex_match_string(regex_node* root, char* string, regex_result* result);
/**
 * Frees the data that regex_match_string stored in a regex_result struct.
 * The struct itself is not freed; it remains owned by the caller.
 * @param result the result struct whose contents are released.
 */
void regex_result_free(regex_result* result);
#endif