/* A lexer with special features for handling e-mail and newsgroup messages. Specifically, this lexer will remove headers which are in the BOW_EMAIL_HEADERS_TO_REMOVE array. Copyright (C) 1997 Andrew McCallum and Jason Rennie Written by: Andrew Kachites McCallum and Jason Rennie This file is part of the Bag-Of-Words Library, `libbow'. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation, version 2. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */ #include #include /* for tolower() */ char **bow_email_headers_to_remove = NULL; /* String compare function used to compare headers. Ignores case distinctions 1 => strings don't match. 0 => strings do match. */ int strcmp_ignore_case (char *str1, char *str2) { int pos = 0; assert(str1 != NULL); assert(str2 != NULL); while (tolower(str1[pos]) == tolower(str2[pos])) { if (str1[pos] == '\0') { return 0; } pos++; } return 1; } /* Assumes that lex->document[lex->document_position] is at the beginning of a newline (immediately after a '\n'). Compares the characters up to (but not including) the first colon against each string in the BOW_EMAIL_HEADERS_TO_REMOVE array. If any match is found, the document_position value is moved to the beginning of the next line. Otherwise, returns document_position to where it was */ void bow_lexer_email_check_header (bow_lex *lex) { int old_document_position = lex->document_position; char *header = bow_malloc(sizeof(char) * BOW_MAX_WORD_LENGTH); int header_position = 0; char byte; int i; /* Put header name into HEADER character array. Stop at a non-printable character, a colon, or an overflow of characters */ do { byte = lex->document[lex->document_position++]; header[header_position++] = byte; } while (((byte >= 33 && byte <= 57) || (byte >= 59 && byte <= 126)) && header_position < BOW_MAX_WORD_LENGTH); /* If it is a proper header, check to see if it is in the list, else return the document position back to its original value. See rfc822 for information on lexing e-mail headers. */ if (byte == 58) { /* Step through NULL-terminated array. */ for (i = 0; bow_email_headers_to_remove[i]; i++) if (strcmp_ignore_case (bow_email_headers_to_remove[i], header) == 0) { while (lex->document[lex->document_position] != '\n' || lex->document[lex->document_position+1] == 32 || lex->document[lex->document_position+1] == 9) { lex->document_position++; } break; } } else { lex->document_position = old_document_position; } free(header); return; } /* Get the raw token from the document buffer by scanning forward until we get a start character, and filling the buffer until we get an ending character. The resulting token in the buffer is NULL-terminated. Return the length of the token. */ int bow_lexer_email_get_raw_word (bow_lexer_simple *self, bow_lex *lex, char *buf, int buflen) { int byte; /* characters read from the FP */ int wordlen; /* number of characters in the word so far */ /* Ignore characters until we get a beginning character. */ do { byte = lex->document[lex->document_position++]; if (byte == 0) return 0; if (byte == '\n') bow_lexer_email_check_header (lex); } while (! self->true_to_start (byte)); /* Add the first alphabetic character to the word. */ buf[0] = (self->case_sensitive) ? byte : tolower (byte); /* Add all the following satisfying characters to the word. */ for (wordlen = 1; wordlen < buflen; wordlen++) { byte = lex->document[lex->document_position++]; if (byte == 0) return 0; if (byte == '\n') bow_lexer_email_check_header (lex); if (! self->false_to_end (byte)) break; buf[wordlen] = tolower (byte); } if (wordlen >= buflen) bow_error ("Encountered word longer than buffer length=%d", buflen); /* Back up to point at the character that caused the end of the word. */ lex->document_position--; /* Terminate it. */ buf[wordlen] = '\0'; return wordlen; } /* Scan a single token from the LEX buffer, placing it in BUF, and returning the length of the token. BUFLEN is the maximum number of characters that will fit in BUF. If the token won't fit in BUF, an error is raised. */ int bow_lexer_email_get_word (bow_lexer *self, bow_lex *lex, char *buf, int buflen) { #define SELF ((bow_lexer_indirect*)self) int wordlen; /* number of characters in the word so far */ do { /* Yipes, this UNDERLYING_LEXER had better be a BOW_LEXER_SIMPLE! */ wordlen = bow_lexer_email_get_raw_word ((bow_lexer_simple*)SELF->underlying_lexer, lex, buf, buflen); if (wordlen == 0) return 0; } while ((wordlen = bow_lexer_simple_postprocess_word ((bow_lexer_simple*)SELF->underlying_lexer, lex, buf, buflen)) == 0); return wordlen; #undef SELF } /* This is declared in lex-simple.c */ extern bow_lexer_simple _bow_alpha_lexer; /* A lexer that ignores headers specified in BOW_EMAIL_HEADERS_TO_REMOVE */ const bow_lexer_indirect _bow_email_lexer = { { sizeof (typeof (_bow_email_lexer)), bow_lexer_simple_open_text_fp, bow_lexer_email_get_word, bow_lexer_simple_close, "", /* document start pattern begins right away */ NULL /* document end pattern goes to end */ }, (bow_lexer*)&_bow_alpha_lexer, /* default UNDERLYING_LEXER */ }; const bow_lexer_indirect *bow_email_lexer = &_bow_email_lexer;