-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsolution.c
More file actions
251 lines (219 loc) · 7.77 KB
/
solution.c
File metadata and controls
251 lines (219 loc) · 7.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
/*
* This problem asks me to read a text file as input and count the number of
* occurrences of given HTTP headers. Headers are found at the start of each
* line, prior to a colon in that line. An additional objective is to devise
* an algorithm for CPU computational efficiency; doing so at the expense of
* memory is acceptable.
*
* To solve this problem, I will make use of an n-ary tree data structure with
* an integer being stored at each branch and leaf. The position of each
* node's branch will correspond with the subsequent character. To illustrate:
*
* Initially, there is a root node. The initial node has pointers to future
* branches, but no memory is allocated for the branches yet.
*
* <root>
* / | | \
* (A)(B)(..)(Z) <-The parenthesis means there is a pointer that can
* point to a node, but is currently pointing to NULL.
*
* Suppose, as our header string, we received the simple "A". The first node,
* corresponding to the character "A", would be initialized in memory and a
* value of 1 would be assigned to the character.
*
* <root>
* / | | \
* A (B)(..)(Z) <- Words beginning in B through Z have not been seen yet.
* V
* 1 <- "A" observed once.
*
* Next, suppose the next two header strings were "ABBA". That would fill
* the tree much deeper, and it would look like the following:
*
* <root>
* / | | \
* A (B)(..)(Z)
* V
* /-1-\ <- "A" observed once.
* / | \
* (A) B (..) <- "AB" observed zero times; the node value, zero, isn't shown.
* /|\
* (A) B (..) <- "ABB" observed zero times; the node value, zero, isn't shown.
* / \
* A (..)
* V
* 2 <- "ABBA" observed twice.
*
* This procedure is repeated until the entire input file is read. Completing
* this operation requires each input string to be read once before its
* correct positioning in the tree can be found and the node, there,
* incremented. Consequently, this operation is completed in O(N) time, where
* N corresponds to the number of lines of the input file.
*
* Finally, we must find the value at the leaf node of the header files being
* tracked. To do this, we simply travel down the tree, as we did before,
* but we measure the stored value at the leaf rather than incrementing. This
* operation is completed in O(K) time, where K is the number of headers that
* are being tracked. Assuming that the input sample is much larger than the
* number of tracked headers (N >> K), the input sample is the dominant strain
* on computation.
*
* One shortcoming of using trees in this way is that it is memory-
* inefficient. Each node has one leaf pointer for each possible child,
* even though typically most of those pointers do not point to any
* memory in use. The usage of memory could be improved by using a dynamic
* array that could grow in size as the leaves are filled. However, I do not
* think it is possible to effectively use dynamic arrays without reducing CPU
* performance.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/*
* The node structure is used to create an n-ary tree with a 'count' variable
* at each node. This tree is used to keep track of which headers' words are
* observed and how many times for each observation. Space is
* wasted by using 128 pointers per node, corresponding to the char
* values 0-127. In principle, only a-z, A-Z, and '-' are used in the
* testing set, so the memory usage could be reduced to 26+26+1=53
* (or 26+1 if case sensitivity were dropped). However, this would
* require a mapping of each character to a position in an array, which
* would slightly slow the program.
*/
const int numChars = 128;
struct node {
int count;
struct node *nextChar[numChars];
};
FILE *openFile(int, char*[]);
struct node *readFile(FILE *);
struct node *createNode();
void printResults(struct node*, int, char *headers[]);
void addWordToTree(struct node* root, char *word);
int getWordcountFromTree(struct node* root, char *word);
void freeMemory(struct node*);
int main(int argc, char* argv[]) {
// Headers-to-track are chosen at compilation-time
int numHeaders = 5;
char *headers[numHeaders];
headers[0] = "Connection";
headers[1] = "Accept";
headers[2] = "Content-Length";
headers[3] = "CableModems-testCase";
headers[4] = "Content";
// Open input file
FILE *ptr_file = openFile(argc, argv);
if(!ptr_file)
return 1;
// Read input file, build tree of headers
struct node* root = readFile(ptr_file);
// Print the number of occurences of the headers in the tree.
printResults(root, numHeaders, headers);
// Close file
fclose(ptr_file);
// Free memory from tree
freeMemory(root);
// Success
return 0;
}
/*
* openFile opens the file supplied at run-time and returns the file's pointer.
*/
FILE *openFile(int argc, char* argv[]) {
const char *fileInput = argv[1];
if(argc != 2) {
printf("Please supply one argument for input filename.\n");
printf("E.g.: ./solution inputFile.txt\n");
return NULL;
}
FILE *ptr_file = fopen(fileInput, "r");
if (!ptr_file) {
printf("Error opening input file.\n");
return NULL;
}
return ptr_file;
}
/*
* The readFile function Reads the input text file, line by line, and builds
* up the tree of word-count.
*/
struct node *readFile(FILE *ptr_file) {
char buf[1000]; // Artibrary length; assumed longer than any header line
struct node* root = createNode();
while (fgets(buf,1000, ptr_file)!=NULL) {
// Find the position of the first colon in the string
char *colon = strchr(buf, ':');
// Terminate the string at the location of the first colon
buf[colon-buf] = '\0';
addWordToTree(root, buf);
}
return root;
}
/*
* The addWordToTree function will find the location on the tree corresponding
* to the passed word and incriment the count by 1. If the tree doesn't
* have branches corresponding to the letters of the word, the tree's missing
* branches are first built by this function.
*/
void addWordToTree(struct node* root, char *word) {
struct node* curNode = root;
int pos = 0;
while(word[pos] != '\0') {
if(curNode->nextChar[word[pos]] == NULL)
curNode->nextChar[word[pos]] = createNode();
curNode = curNode->nextChar[word[pos]];
pos += 1;
}
curNode->count += 1;
}
/*
* The printResults function prints the number of occurences of watched
* headers to stdout.
*/
void printResults(struct node* root, int nHeaders, char *headers[]) {
for(int i = 0; i < nHeaders; i++) {
printf("'%s' seen %d times.\n", headers[i],
getWordcountFromTree(root, headers[i]));
}
}
/*
* The getWordcountFromTree function finds the tree's leaf corresponding to
* the word that was passed and returns the count at that node. If the tree
* isn't built out to that leaf, it is due to that word not being observed in
* a header file, so the count (i.e. 0) is returned.
*/
int getWordcountFromTree(struct node* root, char *word) {
struct node* curNode = root;
int pos = 0;
while(word[pos] != '\0') {
if(curNode->nextChar[word[pos]] == NULL)
return 0;
curNode = curNode->nextChar[word[pos]];
pos += 1;
}
return curNode->count;
}
/*
* The createNode function allocates memory and initializes a node struct.
*/
struct node *createNode() {
struct node* theNode = (struct node*) malloc(sizeof(struct node));
// initialize values
theNode->count = 0;
for(int i = 0; i < numChars; i++) {
theNode->nextChar[i] = NULL;
}
return theNode;
}
/*
* The freeMemory function recursively, by depth first, frees memory from
* the tree.
*/
void freeMemory(struct node* theNode) {
for(int i = 0; i < numChars; i++) {
if(theNode->nextChar[i] != NULL) {
freeMemory(theNode->nextChar[i]);
}
}
free(theNode);
}