-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspell-checker_data.c
221 lines (197 loc) · 5.75 KB
/
spell-checker_data.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "spell-checker_data.h"
/**
* A Trie implementation to hold the dictionary data.
*
* See https://en.wikipedia.org/wiki/Trie for a description of a Trie.
*
* TODO: Extract the implementation out, put into specific implementation file,
* to easily test with other data-structures.
*
* This implementation is a bit wasteful in memory, but should be rather fast.
* It is wasteful, as it is allocating an empty array for each node created,
* even though potentially none of the children will be used. In practice, most
* of them will probably not be used (TODO: measure how much exactly).
* It is, however, rather fast, as the access to each element is O(1), as each
* child is located at the index of it's character representation.
*
* Visual representation of the Trie containing a single word 'air':
*
* (0) <-- Root
* ______|______
* | | |
* (0)...(a)...(0xFF) <-- ... 256 children (all but one empty)
* ______|_______
* | | |
* (0)...(i)...(0xFF) <-- ... 256 children (all but one empty)
* ______|_______
* | | |
* (0)...(r)...(0xFF) <-- ... 256 children (all but one empty)
* ______|_______
* | |
* (0) (0xFF) <-- ... 256 children (all empty)
*
*
* Multiple strategies may be implemented to improve memory waste: allocating
* nodes for legal characters only, allocating nodes as-needed only, not
* allocating nodes for the leafs, etc.. All have different pros and cons.
*
* Each node in the Trie holds a single character and a list of children.
* The root node has its character element set to 0.
* Upon initialization, only the root node is created, with its children's
* array zero-ed out.
* Every time a word is added, a new node is created in place of the appropriate
* child node.
*/
/*
*Max ascii values (with extended characters)
*/
#define MAX_CHARS_PER_NODE 256
/*
* Each data node holds a character and an array of MAX_CHARS_PER_NODE children.
*/
struct _SpellCheckerData
{
char chr;
struct _SpellCheckerData *child;
};
/*
* Recursively delete a node.
* Note: the node itself is not deleted, but all its children does.
*/
static void scdDeleteNode(SpellCheckerDataHandle node)
{
if (node)
{
if (node->child)
{
for(int i = 0; i < MAX_CHARS_PER_NODE; ++i)
scdDeleteNode(&node->child[i]);
free(node->child);
node->child = NULL;
}
}
}
/*
* Initialize a node with the given character, and allocate memory for the
* children array.
*/
static int scdInitNode(char chr, SpellCheckerDataHandle node)
{
node->chr = chr;
node->child = calloc(MAX_CHARS_PER_NODE, sizeof(struct _SpellCheckerData));
if (!node->child)
{
printf("Failed allocating memory for child array\n");
return -1;
}
return 0;
}
static SpellCheckerDataHandle scdCreateNode(void)
{
SpellCheckerDataHandle node = malloc(sizeof(struct _SpellCheckerData));
if (!node)
{
printf("Failed allocating memory for node\n");
return NULL;
}
if (-1 == scdInitNode(0, node))
{
free(node);
return NULL;
}
return node;
}
SpellCheckerDataHandle scdInit(void)
{
return scdCreateNode();
}
void scdFinalize(SpellCheckerDataHandle data)
{
scdDeleteNode(data);
free(data);
}
/*
* Return an unsigned char instead of a signed one, and transform upper-case
* letters to lower-case.
*/
static inline unsigned char scdNormalizeChar(char c)
{
if (('A' <= c) && ('Z' >= c))
return c + 0x20;
return c;
}
/*
* Make sure a word is valid, per the requirements given in spell-checker.h.
*/
static inline int scdIsValid(const char *word)
{
for (size_t i = 0; i < strlen(word); ++i)
{
if (!( (('a' <= word[i]) && ('z' >= word[i])) ||
(('A' <= word[i]) && ('Z' >= word[i])) ||
(('0' <= word[i]) && ('9' >= word[i])) ||
(0x80 <= scdNormalizeChar(word[i])) ))
return 0;
}
return 1;
}
int scdAddWord(SpellCheckerDataHandle data, const char *word)
{
if (!data || !word || !data->child)
{
printf("Invalid arguments when trying to add a word\n");
return -1;
}
if (!scdIsValid(word))
{
#ifdef DEBUG
printf("Attempted to add an invalid word (%s) to dictionary\n", word);
#endif
return -1;
}
SpellCheckerDataHandle node = data;
size_t i = 0;
const size_t n = strlen(word);
/* Find existing nodes (characters) */
while ((i < n) && (0 != node->child[scdNormalizeChar(word[i])].chr))
{
node = &node->child[scdNormalizeChar(word[i])];
++i;
}
/* If word is longer than what we have stored, we need to add each letter */
while (i < n)
{
if (-1 == scdInitNode(scdNormalizeChar(word[i]),
&node->child[scdNormalizeChar(word[i])]))
{
/* TODO: cleanup nodes that were already created */
return -1;
}
node = &node->child[scdNormalizeChar(word[i])];
++i;
}
return 0;
}
int scdHasWord(SpellCheckerDataHandle data, const char *word)
{
if (!data || !word || !data->child)
{
printf("Invalid arguments passed to scdHasWord\n");
return -1;
}
SpellCheckerDataHandle node = data;
const size_t n = strlen(word);
size_t i = 0;
while (i < n)
{
if (0 == node->child[scdNormalizeChar(word[i])].chr)
return 0;
else
node = &node->child[scdNormalizeChar(word[i])];
++i;
}
return (i == n);
}