Hardware Upgrade Forum - View Single Post

lucas87 · 09-09-2007, 18:40

Ora posto la prima parte del progetto, che consiste nell'analizzatore lessicale. Comunque voglio specificare che sotto Linux funziona, quindi il file è corretto.

Sono 3 file: l'header, il file con le funzioni chiamate dal main e il main.

Codice:

#ifndef LEX_H
#define LEX_H

#include <stdio.h>

/* states of the lexical analyzer; selects which scanner get_next_token() uses */
typedef enum
{
	/* scanning the content of a tag (text between tags) */
	LEX_STATE_CONTENT,
	/* scanning the inside of a start-tag or end-tag */
	LEX_STATE_TAG
} lex_state;

/* structure that represents a file being analyzed */
typedef struct
{
	/* input stream of the analyzer */
	FILE *file;
	/* current state of the analyzer */
	lex_state state;
	/* position (line, column) of the next character of the stream */
	int riga, colonna;
} lex_an;

/* enumerazione dei tipi di token dell'analizzatore lessicale */
typedef enum
{ 
	ERROR,
	OPEN_START_TAG,
	OPEN_END_TAG,
	CLOSE_TAG,
	EQUAL,
	NAME,
	STRING
} token_type;

/* error codes; numbering starts at 1 because the analyzer functions
 * return 0 on success */
enum
{ 
	ERR_BAD_ARG = 1,
	ERR_INPUT,
	ERR_MEM,
	ERR_EOF,
	ERR_INVALID_TOKEN,
	ERR_FILE,
	ERR_GENERIC
};

/* represents a single token */
typedef struct
{
	/* text of the token; the scanner functions allocate this buffer on the heap */
	char *text;
	/* type of the token */
	token_type type;
	/* position (line, column) of the token in the document */
	int riga, colonna;
} token;

/* Reads the next token from the analyzer pla and stores it in *ptk.
 * Returns 0 on success, otherwise one of the ERR_* codes with ptk->type
 * set to ERROR. ptk->text is heap-allocated by the call and is not
 * released by the analyzer, so the caller is expected to free it. */
int get_next_token(lex_an *pla, token *ptk);

/* Creates a new lexical analyzer: opens the file fname for reading and
 * initializes *pla to the content state at line 1, column 1.
 * Returns 0 on success, ERR_FILE if the file cannot be opened. */
int new_lexical_analyzer(char *fname, lex_an *pla);

/* Closes the lexical analyzer by closing its input stream.
 * Returns 0 on success. */
int close_lexical_analyzer(lex_an *pla);

#endif

Codice:

#include <ctype.h>
#include <malloc.h>	/* non-standard; kept for compatibility with the original build */
#include <stdio.h>
#include <stdlib.h>	/* standard declarations of malloc, realloc and free */
#include <string.h>

#include "xsml_lex.h"

/* initial size, in bytes, of the buffer allocated for a token's text;
 * the scanners double it whenever the buffer fills up */
#define MAX_STRING_LEN 255

/* column the analyzer was at when the last newline was read; saved by
 * get_next_char() so that unget_char() can restore it when a '\n' is
 * pushed back */
int prev_colonna;

/*
 * Reads one character from la->file and updates the tracked position.
 *
 * On '\n' the line counter is incremented, the current column is saved in
 * prev_colonna and the column is reset to 1; on any other character the
 * column is incremented. At end of input (or on a read error) the position
 * is left untouched.
 *
 * Returns the character read, or EOF narrowed to char when nothing could
 * be read. Because the return type is char, that value is not
 * distinguishable from a real 0xFF byte: callers must check
 * feof(la->file) to detect the end of input.
 */
char get_next_char(lex_an *la)
{
	/* fgetc() returns an int precisely so that EOF differs from every valid
	 * byte value; comparing only after narrowing to char would either treat
	 * byte 0xFF as EOF (signed char) or never match EOF (unsigned char) */
	int c = fgetc(la->file);

	if (c == EOF)
		return (char)EOF;

	if (c == '\n')
	{
		la->riga++;
		prev_colonna = la->colonna;
		la->colonna = 1;
	}
	else
	{
		la->colonna++;
	}

	return (char)c;
}

/*
 * Pushes the character c back onto la->file and rewinds the tracked
 * position by one character: for a newline the line counter goes back by
 * one and the column is restored from prev_colonna, otherwise the column
 * is decremented.
 */
void unget_char(char c, lex_an *la)
{
	ungetc(c, la->file);

	if (c != '\n')
	{
		la->colonna--;
		return;
	}

	/* a newline was pushed back: return to the end of the previous line */
	la->riga--;
	la->colonna = prev_colonna;
}

/*
 * Tells whether c is allowed at the given position of a tag name.
 *
 * position 0: the name must start with an alphabetic character or '_'.
 * position > 0: alphabetic characters, digits, '-', '.' and '_' are allowed.
 *
 * Returns 1 if the character is allowed, 0 otherwise.
 */
int is_tag_char(char c, int position)
{
	/* the <ctype.h> functions require a value representable as unsigned
	 * char; passing a negative plain char is undefined behaviour */
	unsigned char uc = (unsigned char)c;

	if (c == '_' || isalpha(uc))
		return 1;

	/* everything below is only valid after the first character */
	if (position == 0)
		return 0;

	return isdigit(uc) || c == '-' || c == '.';
}

/*
 * Scans the next token while the analyzer is inside a tag.
 *
 * Leading blanks (' ', '\n', '\t') are skipped, then the first significant
 * character decides the token:
 *   '<'         OPEN_START_TAG, or OPEN_END_TAG when followed by '/';
 *               the text is the tag name
 *   '>'         CLOSE_TAG; the analyzer switches back to LEX_STATE_CONTENT
 *   '='         EQUAL
 *   '"' or '\'' STRING; the text is the quoted value without the quotes
 *   otherwise   NAME made of letters, digits, '_' and '-'
 *
 * tok->text always receives a freshly allocated buffer (NULL only when the
 * initial allocation fails); this function never frees it.
 *
 * Returns 0 on success. On failure tok->type is ERROR and the result is
 * ERR_MEM, ERR_EOF or ERR_INVALID_TOKEN.
 */
int get_next_tag_token(lex_an *la, token *tok)
{
	int max_string_len = MAX_STRING_LEN;
	/* current write position inside the buffer str */
	int k = 0;
	char c;
	char *grown;
	char *str = malloc(max_string_len);

	tok->text = str;
	if (str == NULL)
	{
		tok->type = ERROR;
		return ERR_MEM;
	}

	/* terminate the string now so the early error paths need not do it */
	str[0] = 0;

	if (feof(la->file))
	{
		tok->type = ERROR;
		return ERR_EOF;
	} 
	
	/* read one character */
	c = get_next_char(la);
	
	/* skip any separator characters */
	while(c == ' ' || c == '\n' || c == '\t')
	{
		tok->riga = la->riga;
		tok->colonna = la->colonna;

		if (feof(la->file))
		{
			tok->type = ERROR;
			return ERR_EOF;
		}
		c = get_next_char(la);
	}

	switch(c)
	{
		case '<':
		if (feof(la->file))
		{
			tok->type = ERROR;
			return ERR_INVALID_TOKEN;
		}
		c = get_next_char(la);
		
		/* opens an end tag */
		if (c == '/')
		{
			tok->type = OPEN_END_TAG;
			if (feof(la->file))
			{
				tok->type = ERROR;
				return ERR_INVALID_TOKEN;
			}
		}
		else
		{
			/* opens a start tag: c belongs to the name, push it back */
			tok->type = OPEN_START_TAG;
			unget_char(c, la);
		}
		
		/* collect the tag name; the character that ends the name is also
		 * stored and counted in k, and is dropped again below */
		do
		{
			c = get_next_char(la);

			/* if the end of the buffer has been reached */
			if (k == max_string_len-1)
			{
				/* keep the old buffer reachable if realloc fails */
				grown = realloc(str, max_string_len * 2);
				if (grown == NULL)
				{
					tok->type = ERROR;
					tok->text[k] = 0;
					return ERR_MEM;
				}
				max_string_len *= 2;
				str = grown;
				tok->text = str;
			}
			
			/* store the character */
			str[k] = c;
		} while(!feof(la->file) && is_tag_char(c, k++));
		
		if (feof(la->file))
		{
			tok->type = ERROR;
			tok->text[k] = 0;
			return ERR_EOF;
		}
		
		/* a tag name may only be followed by '>' or a separator */
		if (c != '>' && c != ' ' && c != '\n' && c != '\t')
		{
			tok->type = ERROR;
			tok->text[k] = 0;
			return ERR_INVALID_TOKEN;
		}
		
		/* drop the terminating character from the name */
		k--;
		if (k == 0)
		{
			/* empty tag name */
			tok->type = ERROR;
			tok->text[1] = 0;
			return ERR_INVALID_TOKEN;
		}
		/* '>' is a token of its own: leave it for the next call */
		if (c == '>') unget_char(c, la);
		break;
		
		/* closes a tag */
		case '>':
		tok->type = CLOSE_TAG;
		la->state = LEX_STATE_CONTENT;
		break;
		
		case '=':
		tok->type = EQUAL;
		break;
		
		case '"':
		case '\'':
		{
			char open_quote = c;
			tok->type = STRING;
			
			if (feof(la->file))
			{
				tok->type = ERROR;
				return ERR_INVALID_TOKEN;
			}
			
			do
			{
				c = get_next_char(la);
				if (feof(la->file))
				{
					tok->type = ERROR;
					tok->text[k] = 0;
					return ERR_EOF;
				}

				/* if the end of the buffer has been reached */
				if (k == max_string_len-1)
				{
					grown = realloc(str, max_string_len * 2);
					if (grown == NULL)
					{
						tok->type = ERROR;
						tok->text[k] = 0;
						return ERR_MEM;
					}
					max_string_len *= 2;
					str = grown;
					tok->text = str;
				}

				/* store the character */
				str[k++] = c;
			}
			while(c != open_quote && c != '>' && c != '<');
			
			/* the string must end with the same quote that opened it */
			if (open_quote != c)
			{
				tok->type = ERROR;
				str[k] = 0;
				return ERR_INVALID_TOKEN;
			}
			else k--;	/* drop the closing quote */
		}
		break;
		
		default:
		/* it should be a NAME */
		tok->type = NAME;

		if (feof(la->file))
		{
			tok->type = ERROR;
			return ERR_INVALID_TOKEN;
		}
		
		while(1)
		{
			/* if it is not a name character (cast: <ctype.h> functions
			 * must not be given a negative char) */
			if (!isalnum((unsigned char)c) && c != '_' && c != '-')
			{
				if ((c == '>')||(c == '='))
				{
					/* these are tokens of their own: push them back */
					unget_char(c, la);
					break;
				}
				else if ((c == ' ')||(c == '\n')||(c == '\t')) break;
				
				/* otherwise it is an error */
				tok->type = ERROR;
				tok->text[k] = 0;
				return ERR_INVALID_TOKEN;
			}

			/* if the end of the buffer has been reached */
			if (k == max_string_len-1)
			{
				grown = realloc(str, max_string_len * 2);
				if (grown == NULL)
				{
					tok->type = ERROR;
					tok->text[k] = 0;
					return ERR_MEM;
				}
				max_string_len *= 2;
				str = grown;
				tok->text = str;
			}

			/* store the character */
			str[k++] = c;

			c = get_next_char(la);
			if (feof(la->file))
			{
				tok->type = ERROR;
				tok->text[k] = 0;
				return ERR_INVALID_TOKEN;
			}
		}
		
		break;
	}

	/* terminate the string */
	str[k] = 0;

	return 0;
}

/*
 * Scans the text between two tags and returns it as a STRING token.
 *
 * Characters are collected until a '<' or the end of the file is found.
 * The '<' is pushed back and the analyzer switches to LEX_STATE_TAG. When
 * the '<' is the very first character there is no content to report, so
 * the call is forwarded to get_next_tag_token() and its token is returned
 * instead.
 *
 * tok->text receives a freshly allocated buffer (NULL only when the
 * initial allocation fails); this function does not free a buffer it
 * hands back to the caller.
 *
 * Returns 0 on success, ERR_MEM if memory runs out, ERR_EOF if the file
 * ends before any character was read, or whatever get_next_tag_token()
 * returns when the call is forwarded.
 */
int get_next_content_token(lex_an *la, token *tok)
{
	int max_string_len = MAX_STRING_LEN;
	/* current write position inside the buffer str */
	int k = 0;
	char *grown;
	char *str = malloc(max_string_len);

	tok->text = str;
	if (str == NULL)
	{
		tok->type = ERROR;
		return ERR_MEM;
	}

	/* terminate the string now so the error paths need not do it */
	str[0] = 0;

	tok->type = STRING;

	/* until the end of the file has been reached */
	while(!feof(la->file))
	{
		/* read one character */
		char c = get_next_char(la);
		
		if (feof(la->file))
		{
			if (k == 0)
			{
				tok->type = ERROR;
				tok->text[k] = 0;
				return ERR_EOF;
			}
			break;
		} 

		/* a tag is being opened */
		if (c == '<')
		{
			/* step back one character in the file */
			unget_char(c, la);
			
			/* switch to the state that handles tag analysis */
			la->state = LEX_STATE_TAG;

			if (k == 0)
			{
				/* no content was collected: release this buffer before
				 * the tag scanner allocates its own, otherwise it leaks */
				free(str);
				tok->text = NULL;
				return get_next_tag_token(la, tok);
			}
			break;
		}
	
		/* if the end of the buffer has been reached */
		if (k == max_string_len-1)
		{
			/* keep the old buffer reachable if realloc fails */
			grown = realloc(str, max_string_len * 2);
			if (grown == NULL)
			{
				tok->type = ERROR;
				tok->text[k] = 0;
				return ERR_MEM;
			}
			max_string_len *= 2;
			str = grown;
			tok->text = str;
		}

		str[k++] = c;
	}
	
	str[k] = 0;
	
	return 0;
}

/*
 * Reads the next token from the analyzer la into *tok.
 *
 * The token position is set to the analyzer's current line and column,
 * then the scanner matching la->state is invoked.
 *
 * tok->text is either NULL or a heap buffer that this module never frees.
 *
 * Returns 0 on success; ERR_BAD_ARG if an argument (or la->file) is NULL;
 * ERR_EOF if the stream is already at end of file (the token then has type
 * ERROR and an empty text); ERR_MEM if memory runs out; ERR_GENERIC if the
 * analyzer state is not a known one; otherwise the code returned by the
 * scanner.
 */
int get_next_token(lex_an *la, token *tok)
{
	if (tok == NULL) return ERR_BAD_ARG;

	/* give the token a defined value on every early-return path */
	tok->text = NULL;
	tok->type = ERROR;

	if (la == NULL || la->file == NULL) return ERR_BAD_ARG;

	tok->riga = la->riga;
	tok->colonna = la->colonna;

	if (feof(la->file))
	{
		tok->text = malloc(1);
		if (tok->text == NULL) return ERR_MEM;
		tok->text[0] = 0;
		return ERR_EOF;
	}

	switch(la->state)
	{
		case LEX_STATE_CONTENT:
			return get_next_content_token(la, tok);
			
		case LEX_STATE_TAG:
			return get_next_tag_token(la, tok);

		default:
			break;
	}
	
	return ERR_GENERIC;
}

/*
 * Creates a new lexical analyzer on the file called filename.
 *
 * The file is opened for reading in text mode and *la is initialized to
 * LEX_STATE_CONTENT at line 1, column 1.
 *
 * Returns 0 on success, ERR_BAD_ARG if an argument is NULL, ERR_FILE if
 * the file cannot be opened (la->file is then NULL).
 */
int new_lexical_analyzer(char *filename, lex_an *la)
{
	if (filename == NULL || la == NULL) return ERR_BAD_ARG;

	la->file = fopen(filename, "r");
	if (la->file == NULL) return ERR_FILE;

	la->state = LEX_STATE_CONTENT;
	la->riga = 1;
	la->colonna = 1;
	
	return 0;
}

/*
 * Closes the lexical analyzer by closing its input stream.
 *
 * la->file is set to NULL so that a second call cannot close the same
 * stream twice.
 *
 * Returns 0 on success, ERR_BAD_ARG if la is NULL or has no open stream,
 * ERR_FILE if fclose() reports a failure.
 */
int close_lexical_analyzer(lex_an *la)
{
	FILE *stream;

	if (la == NULL || la->file == NULL) return ERR_BAD_ARG;

	/* the stream is unusable after fclose() whether or not it succeeds */
	stream = la->file;
	la->file = NULL;

	if (fclose(stream) == EOF) return ERR_FILE;
	
	return 0;
}

Codice:

#include <stdio.h>
#include <stdlib.h>
#include "xsml_lex.h"

/* number of entries in token_types: the seven token types plus the
 * "unknown" placeholder */
#define TOKEN_TYPES_NUM 8

/* strings associated with the token types; entry 0 is the placeholder for
 * unknown types, entry i+1 is the name of the token type with value i */
static char *token_types[TOKEN_TYPES_NUM] = {
	"unknown token type",
	"ERROR",
	"OPEN_START_TAG",
	"OPEN_END_TAG",
	"CLOSE_TAG",
	"EQUAL",
	"NAME",
	"STRING"
};

/*
 * Returns the string associated with the given token type.
 *
 * Any value outside the known range, negative values included, yields
 * "unknown token type". The returned pointer refers to a string literal
 * and must not be modified or freed.
 */
char *token_type_to_string(int token_type)
{
	/* unknown token: a negative value would otherwise index before the
	 * start of the table */
	if (token_type < 0 || token_type >= TOKEN_TYPES_NUM-1) return token_types[0];
		
	/* known token: shifted by one because of the placeholder at index 0 */
	return token_types[token_type+1];
}

int main(int argc, char **argv)
{
	int error_code;
	lex_an la;
	token tok;
	
	/* controlla che il numero di parametri sia 1 (il nome del file) */
	if (argc != 2)
	{
		printf("Uso: lex_an <nome_file.xsml>\n");
		return 1;
	}
	
	/* crea l'analizzatore lessicale */
	if (error_code = new_lexical_analyzer(argv[1], &la))
	{
		printf("new_lexical_analyzer() ERROR_CODE = %d\n", error_code);
		return error_code;
	}

	/* ottiene e stampa tutti i token contenuti nel file xsML */
	do
	{
		error_code = get_next_token(&la, &tok);
		if (tok.type == CLOSE_TAG || tok.type == EQUAL)
			printf("%d,%d\t%s\n",
				   tok.riga, tok.colonna, token_type_to_string(tok.type));
		else
			printf("%d,%d\t%s\t\"%s\"\n",
				   tok.riga, tok.colonna, token_type_to_string(tok.type), tok.text);
	}
	while(!error_code);

	printf("get_next_token() ERROR_CODE = %d\n", error_code);

	/* chiude l'analizzatore lessicale */
	if (error_code = close_lexical_analyzer(&la))
	{
		printf("close_lexical_analyzer() ERROR_CODE = %d\n", error_code);
		return error_code;
	}
	
	return 0;
}

Il primo è l'header, il secondo il file con le funzioni, il terzo il main.