#ifndef LIBREGEX_HEADER
#define LIBREGEX_HEADER

/**
 * Capacity of the node array stored in a regex_list_s, i.e. the largest
 * number of NFA node pointers a single state list can hold.
 */
#define LIBREGEX_MAX_NODE_COUNT 64

/**
 * Capacity of the group arrays stored in regex_result_s and regex_sim_s.
 */
#define LIBREGEX_MAX_GROUP_COUNT 64

/**
 * Enum used to represent the result
 * of error-prone libregex functions.
 */
enum libregex_result_e {
    /**
     * Represents a successful operation. All went well.
     */
    LIBREGEX_SUCCESS,
    /**
     * Represents an invalid regular expression.
     */
    LIBREGEX_INVALID,
    /**
     * Represents an allocation failure.
     */
    LIBREGEX_MALLOC
};

/**
 * The tag of the tagged union that is a regular expression NFA node:
 * it tells which member of the node's data union is in use.
 */
enum regex_node_type_e {
    /**
     * Represents a node whose value was not yet set.
     * This is usually the case after the initial creation of the node.
     */
    REGEX_CLEAR,
    /**
     * Represents a value node.
     * The value node matches a single character from the input.
     */
    REGEX_VALUE,
    /**
     * Represents a range node.
     * The range node matches a range of characters from the input.
     */
    REGEX_RANGE,
    /**
     * Represents an "any" node.
     * The any node matches any character from the input.
     */
    REGEX_ANY,
    /**
     * Represents a connection node, only used for structural
     * purposes.
     */
    REGEX_CONNECT,
    /**
     * Represents a fork node.
     * This node is used for structural purposes, but, unlike the
     * connect node, can transition into two NFA nodes.
     */
    REGEX_FORK,
    /**
     * Represents the beginning or end of a group.
     * Beyond its use for matching substrings, the group node is treated
     * like a connection node.
     */
    REGEX_GROUP,
    /**
     * Represents the end of the NFA, and a successful match.
     */
    REGEX_END
};

/**
|
|
|
|
* Struct representing a single NFA node.
|
|
|
|
*/
|
|
|
|
struct regex_node_s {
|
|
|
|
/**
|
|
|
|
* The type of the NFA node.
|
|
|
|
*/
|
|
|
|
enum regex_node_type_e type;
|
|
|
|
/**
|
|
|
|
* The ID of the list this node was last added into.
|
|
|
|
* The list ID represents the last "state" list this node
|
|
|
|
* was added to, to prevent the node being checked multiple times
|
|
|
|
* during matching.
|
|
|
|
*
|
|
|
|
* Outside of matching, this is also used for iteration over NFA nodes:
|
|
|
|
* once a node is iterated over, its list ID is set to a certain value
|
|
|
|
* believed to be unique during the iteration so that it is not re-checked.
|
|
|
|
*/
|
|
|
|
int list_id;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The union part of the "tagged union".
|
|
|
|
* data_u represents the possible data types
|
|
|
|
* that the node can represent.
|
|
|
|
*/
|
|
|
|
union {
|
|
|
|
/**
|
|
|
|
* Represents data carried by a value node.
|
|
|
|
*/
|
|
|
|
struct {
|
|
|
|
/**
|
|
|
|
* The value the node matches.
|
|
|
|
*/
|
|
|
|
char value;
|
|
|
|
/**
|
|
|
|
* The next node in the NFA.
|
|
|
|
*/
|
|
|
|
struct regex_node_s* next;
|
|
|
|
} value_s;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Represents data carried by a value node.
|
|
|
|
*/
|
|
|
|
struct {
|
|
|
|
/**
|
|
|
|
* Represents the bottom bounds of the range, inclusive.
|
|
|
|
*/
|
|
|
|
char from;
|
|
|
|
/**
|
|
|
|
* Represents the top bounds of the range, inclusive.
|
|
|
|
*/
|
|
|
|
char to;
|
|
|
|
/**
|
|
|
|
* The next node in the NFA.
|
|
|
|
*/
|
|
|
|
struct regex_node_s* next;
|
|
|
|
} range_s;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Represents data carried by an "any" node.
|
|
|
|
*/
|
|
|
|
struct {
|
|
|
|
/**
|
|
|
|
* The next node in the NFA.
|
|
|
|
*/
|
|
|
|
struct regex_node_s* next;
|
|
|
|
} any_s;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Represents data carried by a connection node.
|
|
|
|
*/
|
|
|
|
struct {
|
|
|
|
/**
|
|
|
|
* The next node in the NFA.
|
|
|
|
*/
|
|
|
|
struct regex_node_s* next;
|
|
|
|
} connect_s;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Represents data carried by a fork node.
|
|
|
|
*/
|
|
|
|
struct {
|
|
|
|
/**
|
|
|
|
* The first next node in the NFA.
|
|
|
|
*/
|
|
|
|
struct regex_node_s* left;
|
|
|
|
/**
|
|
|
|
* The second next node in the NFA.
|
|
|
|
*/
|
|
|
|
struct regex_node_s* right;
|
|
|
|
} fork_s;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Represents data carried by a group node.
|
|
|
|
*/
|
|
|
|
struct {
|
|
|
|
/**
|
|
|
|
* Boolean, whether this is the beginning or end of a group.
|
|
|
|
*/
|
|
|
|
int open;
|
|
|
|
/**
|
|
|
|
* The id of the group.
|
|
|
|
*/
|
|
|
|
int id;
|
|
|
|
/**
|
|
|
|
* The other group node of the same ID.
|
|
|
|
*/
|
|
|
|
struct regex_node_s* other;
|
|
|
|
/**
|
|
|
|
* The next node in the NFA.
|
|
|
|
*/
|
|
|
|
struct regex_node_s* next;
|
|
|
|
} group_s;
|
|
|
|
} data_u;
|
|
|
|
};
|
|
|
|
|
|
|
|
/**
 * Represents a small NFA to be treated as a unit,
 * identified by its first and last nodes.
 */
struct regex_chain_s {
    /**
     * The first node in the NFA chain.
     */
    struct regex_node_s* head;
    /**
     * The last node in the NFA chain.
     */
    struct regex_node_s* tail;
};

/**
|
|
|
|
* Struct that represents a list of nodes currently being
|
|
|
|
* checked against input.
|
|
|
|
*/
|
|
|
|
struct regex_list_s {
|
|
|
|
/**
|
|
|
|
* The ID of the list is used together with
|
|
|
|
* regex_node_s' list_id to prevent multiple
|
|
|
|
* pointers to the same node in the list.
|
|
|
|
*/
|
|
|
|
int id;
|
|
|
|
/**
|
|
|
|
* The number of nodes currently in the list.
|
|
|
|
*/
|
|
|
|
int size;
|
|
|
|
/**
|
|
|
|
* The list of nodes.
|
|
|
|
*/
|
|
|
|
struct regex_node_s* nodes[LIBREGEX_MAX_NODE_COUNT];
|
|
|
|
};
|
|
|
|
|
|
|
|
/**
 * Represents a single matched group as a pair of
 * indices into the matched string.
 */
struct regex_match_s {
    /**
     * The starting index of the match in the string, inclusive.
     */
    int from;
    /**
     * The ending index of the match in the string, inclusive.
     */
    int to;
};

/**
|
|
|
|
* Struct that represents he result of running a match.
|
|
|
|
*/
|
|
|
|
struct regex_result_s {
|
|
|
|
/**
|
|
|
|
* Boolean, whether the regular expression matched or not.
|
|
|
|
*/
|
|
|
|
int matches;
|
|
|
|
/**
|
|
|
|
* List of groups that were matched successfully.
|
|
|
|
*/
|
|
|
|
struct regex_match_s* groups[LIBREGEX_MAX_GROUP_COUNT];
|
|
|
|
};
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Struct that represents data used to match
|
|
|
|
* a string against a regular expression.
|
|
|
|
*/
|
|
|
|
struct regex_sim_s {
|
|
|
|
/**
|
|
|
|
* The string being matched.
|
|
|
|
*/
|
|
|
|
char* string;
|
|
|
|
/**
|
|
|
|
* The index of the string currently being inspected.
|
|
|
|
*/
|
|
|
|
int index;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The current list of states ready to be matched.
|
|
|
|
*/
|
|
|
|
struct regex_list_s* current;
|
|
|
|
/**
|
|
|
|
* The list of states to be matched in the next index.
|
|
|
|
*/
|
|
|
|
struct regex_list_s* next;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The result struct being built during the match.
|
|
|
|
*/
|
|
|
|
struct regex_result_s* result;
|
|
|
|
/**
|
|
|
|
* A list of groups potentially being constructed.
|
|
|
|
*/
|
|
|
|
struct regex_match_s groups[LIBREGEX_MAX_GROUP_COUNT];
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Short aliases for the enum and struct tags declared in this header. */
typedef enum libregex_result_e libregex_result;
typedef enum regex_node_type_e regex_node_type;
typedef struct regex_node_s regex_node;
typedef struct regex_chain_s regex_chain;
typedef struct regex_list_s regex_list;
typedef struct regex_match_s regex_match;
typedef struct regex_result_s regex_result;
typedef struct regex_sim_s regex_sim;

/**
|
|
|
|
* Clears a node completely, resetting all the values to default.
|
|
|
|
* @param node the node to clear.
|
|
|
|
*/
|
|
|
|
void regex_node_clear(regex_node* node);
|
|
|
|
/**
|
|
|
|
* Frees an entire NFA, starting from the given root node.
|
|
|
|
* @param root the root, or starting node, of the NFA.
|
2017-01-08 00:04:31 -08:00
|
|
|
* @return the result of the operation: LIBREGEX_SUCCESS if all goes well, or an error code.
|
2017-01-05 21:27:07 -08:00
|
|
|
*/
|
2017-01-07 22:15:18 -08:00
|
|
|
libregex_result regex_free(regex_node* root);
|
2017-01-05 21:27:07 -08:00
|
|
|
/**
|
|
|
|
* Builds a regular expression from the given regular expression strings.
|
|
|
|
* @param root the root node to build into.
|
|
|
|
* @param expression the expression to construct the NFA from.
|
|
|
|
* @return the result of the operation: LIBREGEX_SUCCESS if all goes well, or an error code.
|
|
|
|
*/
|
|
|
|
libregex_result regex_build(regex_node** root, char* expression);
|
|
|
|
/**
|
|
|
|
* Matches the regular expression against a given string.
|
|
|
|
* @param root the root of a regular expression NFA
|
|
|
|
* @param string the string to be matched
|
|
|
|
* @param result the result to be populated with the data from matching the string.
|
|
|
|
* @return the result of the operation: LIBREGEX_SUCCESS if all goes well, or an error code.
|
|
|
|
*/
|
|
|
|
libregex_result regex_match_string(regex_node* root, char* string, regex_result* result);
|
2017-01-08 00:04:31 -08:00
|
|
|
/**
|
|
|
|
* Frees data used by regex_match_string in a regex_result struct.
|
|
|
|
* The actual struct is not freed.
|
|
|
|
* @param result the result struct
|
|
|
|
*/
|
|
|
|
void regex_result_free(regex_result* result);
|
2017-01-05 21:27:07 -08:00
|
|
|
|
#endif