Download as pdf or txt
Download as pdf or txt
You are on page 1 of 6

1 /*********************************************

2 * Id: gust1185
3 *
4 * Compile: gcc -Wall
5 * Run: ./a.out input.txt
6 *
7 * Reads program line-by-line and outputs it in individual tokens and strings,
8 * comments (including multiline), and characters.
9 *********************************************/
10
11 #define MAXTOKEN 256
12
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <ctype.h>
17
18 // Assign meaning to tokens
19 void lex(char *line, int start, int end, int length, char* type, int removeLeading );
20
21 // Break the string down into tokens
22 void tokenize(char *line, int length);
23
24 //checks if a token is a keyword
25 int strIsKeyword( char *str );
26 int strIsNumeric( char *str );
27 int strIsIdentifier( char *str );
28
/*
 * Program entry point.
 *
 * Expects the path of the file to tokenize in argv[1]. The file is read one
 * line at a time with getline() and every line, together with the number of
 * characters getline() reported for it, is handed to tokenize().
 *
 * Returns 1 when no input file was named, EXIT_FAILURE when the file cannot
 * be opened, and EXIT_SUCCESS once the whole file has been processed.
 * Diagnostics are written to stderr so they never mix with the token listing
 * printed on stdout.
 */
int main(int argc, char *argv[]){
    if (argc < 2) {
        /* argv[0] may legitimately be NULL; never pass NULL to %s. */
        const char *prog = (argc > 0 && argv[0] != NULL) ? argv[0] : "a.out";

        fprintf(stderr, "Please specify input file.\n");
        fprintf(stderr, "%s /y/shared/Engineering/cs-drbc/assignments/cs210/w01_in1.txt\n", prog);
        return 1;
    }

    FILE *fp = fopen(argv[1], "r");
    if (fp == NULL) {
        fprintf(stderr, "Error: Could not open file %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    char *line = NULL;   /* buffer owned and grown by getline() */
    size_t len = 0;      /* current capacity of that buffer */
    ssize_t read;

    while ((read = getline(&line, &len, fp)) != -1) {
        tokenize(line, (int)read);
    }

    free(line);          /* free(NULL) is a no-op, so no guard is needed */
    fclose(fp);
    return EXIT_SUCCESS;
}
58
/*
 * Classify one lexeme and print it as "<type>: <text>".
 *
 * line           buffer holding the current source line
 * start, end     half-open range [start, end) of the lexeme inside line
 * length         number of valid characters in line
 * type           label supplied by the caller ("Token", "Operator", "Comment",
 *                "Char", "String"); it is replaced by "Keyword", "Identifier"
 *                or "Numeric" when the lexeme text matches one of those
 * removeLeading  when 1, leading whitespace is skipped before the lexeme
 *
 * Nothing is printed for an empty lexeme.
 *
 * Differences from the previous implementation:
 *  - the lexeme is copied into a buffer sized for it, so lexemes of
 *    MAXTOKEN characters or more (long comments or strings) no longer
 *    overflow a fixed stack array;
 *  - when end runs past the buffer only a trailing newline is dropped, so
 *    the last character of a file that lacks a final newline is kept;
 *  - an identifier made only of hex letters (e.g. "abc") is reported as
 *    Identifier instead of Numeric.
 */
void lex(char *line, int start, int end, int length, char* type, int removeLeading ){
    if (start < 0) {
        start = 0;
    }

    /* Callers may overshoot the end of the line; clamp to the buffer and
     * leave the line terminator out of the lexeme. */
    if (end >= length) {
        end = length;
        if (end > 0 && line[end - 1] == '\n') {
            end--;
        }
    }

    if (removeLeading == 1) {
        /* Bounds check first, then look at the character. */
        while (start < length && isspace((unsigned char)line[start])) {
            start++;
        }
    }

    if (start >= end) {
        return;
    }

    size_t tokenLen = (size_t)(end - start);
    char *token = malloc(tokenLen + 1);
    if (token == NULL) {
        return;
    }
    memcpy(token, &line[start], tokenLen);
    token[tokenLen] = '\0';

    if (token[0] != '\0') {
        /* Keywords are also valid identifiers, so test them first; the
         * identifier test precedes the numeric one because both accept
         * words built solely from the letters a-f. */
        if (strIsKeyword(token)) {
            type = "Keyword";
        } else if (strIsIdentifier(token)) {
            type = "Identifier";
        } else if (strIsNumeric(token)) {
            type = "Numeric";
        }
        printf("%s: %s\n", type, token);
    }

    free(token);
}
93
/*
 * Width of the operator that begins with the characters (first, second):
 * 2 for a two-character operator, 1 for a single-character operator, 0 when
 * first does not start an operator. Two-character operators win over their
 * one-character prefixes (":=" before ":").
 */
static int operatorWidth(char first, char second)
{
    static const char *const pairs[] = {
        ":=", "..", "<<", ">>", "<>", "<=", ">=", "**", "!=", "=>", "{:", "}:"
    };
    size_t i;

    for (i = 0; i < sizeof pairs / sizeof pairs[0]; i++) {
        if (first == pairs[i][0] && second == pairs[i][1]) {
            return 2;
        }
    }
    /* strchr() also matches the terminating NUL, so rule that out first. */
    if (first != '\0' && strchr("<>()+-*|/&;,:=$@[]{}", first) != NULL) {
        return 1;
    }
    return 0;
}

/*
 * Index of the '/' that closes a comment, searching line[from .. length).
 * A closing '/' must be directly preceded by '*'. Returns length when the
 * comment does not end on this line. The search never looks at an index
 * below 1, so line[-1] is never read.
 */
static int findCommentEnd(const char *line, int from, int length)
{
    int pos = (from < 1) ? 1 : from;

    while (pos < length && !(line[pos] == '/' && line[pos - 1] == '*')) {
        pos++;
    }
    return (pos < length) ? pos : length;
}

/*
 * Split one source line into lexemes and pass each of them to lex().
 *
 * line    buffer holding the line
 * length  number of valid characters in line
 *
 * Recognized, in priority order: comments (which may span several lines;
 * the "inside a comment" state is kept in a static flag between calls),
 * two-character operators, one-character operators, character literals in
 * single quotes, string literals in double quotes, and whitespace as a
 * separator. Text between those delimiters is sent to lex() as "Token".
 *
 * Differences from the previous implementation:
 *  - the search for the end of a comment starts after the opening two
 *    characters, so the opening '/' can no longer be mistaken for the
 *    closing one and no character before the start of the line is read;
 *  - look-ahead reads are bounded by length.
 *
 * Known limitations kept as they were: text that runs to the end of the
 * buffer without a following delimiter is not emitted, and string literals
 * have no escape handling.
 */
void tokenize(char *line, int length){
    static int multilineComment = 0;   /* 1 while an unclosed comment is open */
    int start = 0;                     /* first character not yet emitted */
    int end;

    for (end = 0; end < length; end++) {
        char cur = line[end];
        char next = (end + 1 < length) ? line[end + 1] : '\0';
        int opWidth;

        if (multilineComment || (cur == '/' && next == '*')) {
            int continuing = multilineComment;
            int closeAt;

            if (!continuing) {
                /* Flush whatever precedes the comment opener. */
                lex(line, start, end, length, "Token", 1);
            }
            start = end;

            /* A fresh comment can close no earlier than three characters
             * after its opening '/'; a continued one may close anywhere. */
            closeAt = findCommentEnd(line, continuing ? end : end + 3, length);
            multilineComment = (closeAt >= length);

            end = closeAt + 1;
            /* Keep the indentation of continuation lines inside a comment. */
            lex(line, start, end, length, "Comment", !continuing);
            start = end;
            end--;   /* the for-loop increment moves back onto start */
        }
        else if ((opWidth = operatorWidth(cur, next)) > 0) {
            lex(line, start, end, length, "Token", 1);
            lex(line, end, end + opWidth, length, "Operator", 1);
            start = end + opWidth;
            end = start - 1;
        }
        else if (cur == '\'') {
            lex(line, start, end, length, "Token", 1);
            start = end;
            end++;   /* step past the opening quote */
            while (end < length && line[end] != '\'') {
                end++;
            }
            /* An escaped quote (backslash, quote) followed by the real
             * closing quote: include one more character. */
            if (end + 1 < length && line[end + 1] == '\'' && line[end - 1] == '\\') {
                end++;
            }
            end++;
            lex(line, start, end, length, "Char", 1);
            start = end;
            end--;
        }
        else if (cur == '"') {
            lex(line, start, end, length, "Token", 1);
            start = end;
            end++;   /* step past the opening quote */
            while (end < length && line[end] != '"') {
                end++;
            }
            end++;
            lex(line, start, end, length, "String", 1);
            start = end;
            end--;
        }
        else if (isspace((unsigned char)cur)) {
            /* Whitespace ends the pending token; lex() skips the separator. */
            lex(line, start, end, length, "Token", 1);
            start = end;
        }
    }
}
227
/*
 * Report whether str is exactly one of the language's reserved words.
 *
 * The comparison is case-sensitive and must match the whole string.
 * Returns 1 for a keyword, 0 otherwise (including when str is NULL).
 */
int strIsKeyword( char *str ){
    static const char *const keywords[] = {
        "accessor", "and",      "array",   "bool",    "character", "constant",
        "else",     "elsif",    "end",     "exit",    "float",     "func",
        "if",       "ifc",      "in",      "integer", "is",        "mutator",
        "natural",  "null",     "of",      "or",      "others",    "out",
        "pkg",      "positive", "proc",    "ptr",     "range",     "subtype",
        "then",     "type",     "when",    "while"
    };
    size_t i;

    if (str == NULL) {
        return 0;
    }
    for (i = 0; i < sizeof keywords / sizeof keywords[0]; i++) {
        if (strcmp(str, keywords[i]) == 0) {
            return 1;
        }
    }
    return 0;
}
335
/*
 * Report whether str has the shape of an identifier: a letter followed by
 * any number of letters, digits or underscores.
 *
 * Returns 1 when it does, 0 otherwise (including for "" and NULL).
 * Characters are converted to unsigned char before being passed to the
 * <ctype.h> functions, which is required for bytes outside the ASCII range.
 */
int strIsIdentifier( char *str ){
    const char *p;

    if (str == NULL || !isalpha((unsigned char)str[0])) {
        return 0;
    }
    for (p = str + 1; *p != '\0'; p++) {
        if (*p != '_' && !isalnum((unsigned char)*p)) {
            return 0;   /* first invalid character decides */
        }
    }
    return 1;
}
354
/*
 * Report whether str has the shape of a numeric literal: it starts with a
 * decimal digit and continues with hexadecimal digits and at most one '.'.
 *
 * Returns 1 when it does, 0 otherwise (including for NULL).
 *
 * Requiring a leading decimal digit is a deliberate tightening: the empty
 * string, a lone "." and words made only of the letters a-f (such as "abc")
 * were previously accepted as numbers.
 */
int strIsNumeric( char *str ){
    const char *p;
    int periods = 0;

    if (str == NULL || !isdigit((unsigned char)str[0])) {
        return 0;
    }
    for (p = str + 1; *p != '\0'; p++) {
        if (*p == '.') {
            if (++periods > 1) {
                return 0;   /* a second '.' makes it something else */
            }
        } else if (!isxdigit((unsigned char)*p)) {
            return 0;
        }
    }
    return 1;
}
380

You might also like