/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 * Manipulates a file of strings as a vocabulary, assigning a dense index to each string.
 * The offset of the index can be specified with a "%first_value n" string at the beginning
 * of the file, and a special "unknown word" can be specified with a "%spelling_of_unknown FOO"
 * string.
 *
 * J. Lafferty 2/13/95
 * adapted to be called from matlab L.K. 10/27/2004
 *
 * NOTE(review): this copy of the file appears to have lost text in several
 * places (everything between a '<' and a later '>' seems to be missing).
 * The damaged spots are marked below; the code there is reproduced exactly
 * as found and will not compile until it is restored from a clean copy.
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
#include "math.h"
#include "string.h"
#include "ctype.h"
#include "mxutil.h"
#include "mxvocabulary.h"

/* Capacity of the global voc[] table. */
#define MAX_VOCS 256
/* Size of the line buffer used when reading a vocabulary file. */
#define MAX_SPELL_LENGTH 80

/* One vocabulary entry; entries that share a hash slot are chained via next. */
typedef struct hte hte;
struct hte {
  char* spelling;   /* heap copy of the entry's string */
  int value;        /* zero-based index, assigned as ++last_value on insertion */
  int count;        /* starts at 0; incremented on every successful lookup */
  hte* next;        /* next entry in the same hash chain */
};

/* Per-vocabulary state; one element of voc[] per open vocabulary. */
typedef struct voc_data voc_data;
struct voc_data {
  char* file_name;            /* heap copy of the file name, or "" for an unnamed new vocabulary */
  int first_value;            /* offset added to an entry's internal value before it is returned */
  int last_value;             /* highest internal value assigned so far; -1 when empty */
  int size;                   /* number of entries */
  int htsize;                 /* number of slots allocated for ht */
  int has_unknown;            /* nonzero when an "unknown word" spelling is configured */
  char* spelling_of_unknown;  /* spelling that stands for the unknown word */
  int value_of_unknown;       /* internal value of the unknown word; -1 until it is inserted */
  hte** entry;                /* entries indexed by internal value */
  hte** ht;                   /* heads of the hash chains */
  char is_new;                /* 1 when created by voc_open_new, 0 when loaded by voc_open */
};

/* Number of slots of voc[] handed out so far. */
int num_vocs_open=0;
voc_data voc[MAX_VOCS];

/*
 * Trial-division primality test: returns 0 as soon as some i in
 * 2..(int)sqrt(p) divides p, otherwise 1.
 * For p = 0 or 1 the loop body never runs, so 1 is returned.
 */
static int isprime (int p)
{
  int i;
  for(i=2; i<=(int) sqrt((double)p); ++i)
    if ((p % i) == 0) return 0;
  return 1;
}

/* Returns the smallest value >= n for which isprime() reports 1. */
static int ceilprime(int n)
{
  while (1) {
    if (isprime(n)) return n;
    n++;
  }
}

/*
 * NOTE(review): DAMAGED REGION. The text following "for (i=0; i" has been
 * lost up to the "next)" of a later hash-chain loop. Missing are the body of
 * already_open(), the definition of hash() (called by the lookup functions
 * below), and the head of what is presumably add_to_ht(unit, spelling)
 * (called by voc_add) -- confirm against a clean copy.
 *
 * What remains after the damaged loop header is an insertion routine: it
 * reports a duplicate spelling through mexErrMsgTxt, allocates a new hte with
 * its own copy of the spelling, pushes it on the front of chain ht[h],
 * assigns it the next internal value, records it in entry[], and, when the
 * spelling equals spelling_of_unknown, remembers its value in
 * value_of_unknown. It returns the new internal value (without first_value).
 */
static int already_open(int* u, const char* fn)
{
  int i;
  for (i=0; inext) {
    if (strcmp(x->spelling, spelling) == 0) {
      /*error("VOC: duplicate spelling: \"%s\" in vocabulary %s\n", spelling, voc[unit].file_name); */
      mexErrMsgTxt("VOC: duplicate spelling in vocabulary");
    }
  }
  /* NOTE(review): neither malloc result is checked before use. */
  x = (hte*) malloc(sizeof(hte));
  x->spelling = (char*) malloc(sizeof(char)*(strlen(spelling)+1));
  strcpy(x->spelling, spelling);
  x->count = 0;
  x->next = voc[unit].ht[h];            /* new entry becomes the chain head */
  x->value = ++voc[unit].last_value;
  voc[unit].size++;
  voc[unit].ht[h] = x;
  /* NOTE(review): no check here that x->value fits the allocated entry[] table. */
  voc[unit].entry[x->value] = x;
  if (voc[unit].has_unknown && (strcmp(voc[unit].spelling_of_unknown, spelling) == 0)) {
    voc[unit].value_of_unknown = voc[unit].last_value;
  }
  return x->value;
}

/*
 * Opens the vocabulary file fn and stores the unit number in *u.
 *
 * Returns 1 immediately when already_open(u, fn) is nonzero. Otherwise the
 * next slot of voc[] is claimed and initialised, and a first pass over the
 * file handles lines that start with '%' ("%no_unknown",
 * "%spelling_of_unknown X", "%first_value N") and counts every other line in
 * size. The hash table and entry table are then sized from that count.
 *
 * Errors are reported through mexErrMsgTxt (declared elsewhere; presumably it
 * does not return to the caller -- confirm).
 *
 * NOTE(review): DAMAGED REGION at the end of this function, see below.
 */
int voc_open(int* u, const char* fn)
{
  FILE *fp;
  int i, c, unit;
  char spelling[MAX_SPELL_LENGTH], *p;

  if (already_open(u, fn)) return 1;
  fp = fopen(fn, "r");
  if (fp == NULL) {
    /*error("VOC_OPEN: Cannot open vocabulary %s\n", fn);*/
    mexErrMsgTxt("VOC_OPEN: Cannot open vocabulary");
  }
  unit = num_vocs_open++;
  *u = unit;
  /* NOTE(review): the counter is incremented before this check and is not
     rolled back when the check fires. */
  if (num_vocs_open == MAX_VOCS) {
    /*error("VOC_OPEN: Too many vocabularies open (max %d)\n", MAX_VOCS);*/
    mexErrMsgTxt("VOC_OPEN: Too many vocabularies open");
  }
  voc[unit].file_name = (char*) malloc(sizeof(char)*(strlen(fn)+1));
  strcpy(voc[unit].file_name, fn);
  voc[unit].size = 0;
  voc[unit].first_value = 0;
  voc[unit].last_value = -1;
  voc[unit].has_unknown = 0;
  voc[unit].spelling_of_unknown = "***";
  voc[unit].value_of_unknown = -1;
  voc[unit].is_new = 0;
  /* first pass sets parameters and gets size */
  while (1) {
    /* Peek at the first character of the line, then push it back so that
       fgets still reads the whole line. */
    c = fgetc(fp);
    if ((c != EOF) && (ungetc(c, fp) == EOF)) {
      /*error("VOC_OPEN: cannot ungetc\n");*/
      mexErrMsgTxt("VOC_OPEN: cannot ungetc");
    }
    if (fgets(spelling, MAX_SPELL_LENGTH, fp) == NULL) break;
    /* NOTE(review): the last character is overwritten whether or not it is a
       newline (a final line without '\n', or a line longer than the buffer,
       loses a real character). */
    spelling[strlen(spelling)-1] = '\0'; /* get rid of newline */
    if (c == '%') {
      /* "%no_unknown" must match the whole line; the other two directives
         are matched with strstr, i.e. anywhere in the line. */
      if (strcmp(spelling, "%no_unknown") == 0) {
        voc[unit].has_unknown = 0;
      }
      else if (strstr(spelling, "%spelling_of_unknown") != NULL) {
        /* The second space-separated token is the argument.
           NOTE(review): p is not checked for NULL before strlen/strcpy. */
        p = strtok(spelling, " ");
        p = strtok(NULL, " ");
        voc[unit].has_unknown = 1;
        voc[unit].spelling_of_unknown = (char *) malloc(sizeof(char)*(strlen(p)+1));
        strcpy(voc[unit].spelling_of_unknown, p);
      }
      else if (strstr(spelling, "%first_value") != NULL) {
        /* NOTE(review): p is not checked for NULL before atoi. */
        p = strtok(spelling, " ");
        p = strtok(NULL, " ");
        voc[unit].first_value = atoi(p);
      }
    }
    else {
      voc[unit].size++;
    }
  }
  rewind(fp);
  /* Hash table gets a prime number of slots, at least twice the entry count.
     NOTE(review): with size == 0, ceilprime(0) yields 0 because isprime(0)
     returns 1. */
  voc[unit].htsize = ceilprime(2*voc[unit].size);
  voc[unit].ht = (hte**) malloc(sizeof(hte*)*voc[unit].htsize);
  voc[unit].entry = (hte**) malloc(sizeof(hte*)*voc[unit].size);
  /*
   * NOTE(review): DAMAGED REGION. The text following "for (i=0; i" has been
   * lost up to the "spelling);" of a later statement. Missing are the rest of
   * voc_open (including its final return value) and the head of the routine
   * whose tail follows; that tail closes a file and carries the original
   * "tables should be freed" remark.
   */
  for (i=0; ispelling);
    }
    fclose(fp);
  }
  /* tables should be freed here!!
  */
}

/* modified -LK */
/*
 * Creates a new, empty vocabulary with room for `size` hash slots and `size`
 * entries, and stores its unit number in *u.
 *
 * fn may be NULL, in which case file_name is set to "". When fn is given, an
 * already-open vocabulary of that name is reported through mexErrMsgTxt, and
 * a message is printed when qfilef(fn) is nonzero (qfilef is declared
 * elsewhere; presumably a file-exists test -- confirm).
 *
 * NOTE(review): DAMAGED REGION at the end of this function, see below.
 */
int voc_open_new(int* u, const char* fn, int size)
{
  FILE *fp;
  int i, c, unit;
  char spelling[MAX_SPELL_LENGTH], *p;

  if (fn!=NULL) {
    if (already_open(u, fn)) {
      /*error("VOC_OPEN_NEW: Already opened vocabulary %s", fn);*/
      mexErrMsgTxt("VOC_OPEN_NEW: Already opened vocabulary");
    }
    if (qfilef(fn)) {
      printf("VOCABULARY: %s already exists\n", fn);
    }
  }
  unit = num_vocs_open++;
  *u = unit;
  if (num_vocs_open == MAX_VOCS) {
    /*error("VOC_OPEN: Too many vocabularies open (max %d)\n", MAX_VOCS);*/
    mexErrMsgTxt("VOC_OPEN: Too many vocabularies open");
  }
  if (fn != NULL) {
    voc[unit].file_name = (char*) malloc(sizeof(char)*(strlen(fn)+1));
    strcpy(voc[unit].file_name, fn);
  }
  else {
    voc[unit].file_name = "";
  }
  voc[unit].size = 0;
  voc[unit].first_value = 0;
  voc[unit].last_value = -1;
  voc[unit].has_unknown = 0;
  voc[unit].spelling_of_unknown = "";
  voc[unit].value_of_unknown = -1;
  /* Unlike voc_open, the caller-supplied size is used as is for both tables. */
  voc[unit].htsize = size;
  voc[unit].ht = (hte**) malloc(sizeof(hte*)*size);
  voc[unit].entry = (hte**) malloc(sizeof(hte*)*size);
  voc[unit].is_new = 1;
  /*
   * NOTE(review): DAMAGED REGION. The text following "for (i=0; i" has been
   * lost up to the "voc[u].last_value))" of a later bounds check. Missing
   * are the rest of voc_open_new and the head of the next routine, which
   * judging by its error text is voc_spelling(u, i) -- confirm.
   *
   * The surviving tail reports an out-of-bounds index through mexErrMsgTxt
   * and returns the spelling stored at entry[i - first_value].
   */
  for (i=0; ivoc[u].last_value))
    /*error("VOC_SPELLING: out of bounds index %i for vocabulary %s\n", i, voc[u].file_name);*/
    mexErrMsgTxt("VOC_SPELLING: out of bounds index for vocabulary");
  return voc[u].entry[i-voc[u].first_value]->spelling;
}

/*
 * Looks up spelling w in vocabulary u.
 * On a hit the entry's count is incremented and first_value + its internal
 * value is returned. On a miss the error is reported through mexErrMsgTxt;
 * -1 is returned if that call returns.
 */
int voc_value(const int u, const char* w)
{
  int h;
  hte* x;

  h = hash(u, w);
  for (x = voc[u].ht[h]; x != NULL; x = x->next) {
    if (strcmp(w, x->spelling) == 0) {
      x->count++;
      return voc[u].first_value + x->value;
    }
  }
  /*error("VOC_VALUE: Spelling %s is not in vocabulary %s\n", w, voc[u].file_name);*/
  mexErrMsgTxt("VOC_VALUE: Spelling is not in vocabulary");
  return -1;
}

/*
 * Non-failing lookup of spelling w in vocabulary u.
 * On a hit: increments the entry's count, sets *found to 1 and returns
 * first_value + its internal value.
 * On a miss: sets *found to 0 and returns first_value + value_of_unknown when
 * the vocabulary has an unknown word, otherwise -1.
 */
int voc_qvalue(const int u, const char* w, int* found)
{
  int h;
  hte* x;

  h = hash(u, w);
  for (x = voc[u].ht[h]; x != NULL; x = x->next) {
    if (strcmp(w, x->spelling) == 0) {
      x->count++;
      *found = 1;
      return voc[u].first_value + x->value;
    }
  }
  *found = 0;
  return voc[u].has_unknown ? (voc[u].first_value + voc[u].value_of_unknown) : -1;
}

/*
 * Returns the value of spelling w in vocabulary u, inserting it first when it
 * is not present. An existing entry has its count incremented; a new entry is
 * created by add_to_ht. The result includes the first_value offset either way.
 */
int voc_add(const int u, const char* w)
{
  int h;
  hte* x;

  h = hash(u, w);
  for (x = voc[u].ht[h]; x != NULL; x = x->next) {
    if (strcmp(w, x->spelling) == 0) {
      x->count++;
      return voc[u].first_value + x->value;
    }
  }
  return voc[u].first_value + add_to_ht(u, w);
}

/*
 * Returns the lookup count of the entry at index i of vocabulary u.
 *
 * NOTE(review): DAMAGED LINE. The bounds check has lost the text between
 * "(i" and "voc[u].last_value))"; it is reproduced as found.
 * NOTE(review): entry[] is indexed with i directly here, whereas the
 * spelling lookup above subtracts first_value -- confirm which is intended.
 */
int voc_count (const int u, int i)
{
  hte* x;

  if ((ivoc[u].last_value))
    /*error("VOC_COUNT: out of bounds index %i for vocabulary %s\n", i, voc[u].file_name);*/
    mexErrMsgTxt("VOC_COUNT: out of bounds index for vocabulary");
  x = voc[u].entry[i];
  return x->count;
}

/* Returns the first_value offset of vocabulary u. */
int voc_first_value (const int u)
{
  return voc[u].first_value;
}

/* Returns the stored last_value of vocabulary u (first_value is not added). */
int voc_last_value (const int u)
{
  return voc[u].last_value;
}

/* Returns the number of entries recorded for vocabulary u. */
int voc_size (const int u)
{
  return voc[u].size;
}

/* Returns nonzero when vocabulary u has an unknown-word spelling configured. */
int voc_has_unknown (int u)
{
  return voc[u].has_unknown;
}

/*
 * Returns the unknown-word spelling of vocabulary u; reports an error through
 * mexErrMsgTxt when the vocabulary has none. The returned pointer is the
 * stored string itself, not a copy.
 */
char *voc_spelling_of_unknown(int u)
{
  if (voc[u].has_unknown == 0)
    /*error("VOC: vocabulary %s has no unknown\n", voc[u].file_name);*/
    mexErrMsgTxt("VOC: vocabulary has no unknown");
  return voc[u].spelling_of_unknown;
}

/*
 * Returns the stored value_of_unknown of vocabulary u; reports an error
 * through mexErrMsgTxt when the vocabulary has no unknown word.
 *
 * NOTE(review): this is the only call in the file that passes mexErrMsgTxt a
 * second argument (and a trailing "\n"); every other call passes one string.
 * NOTE(review): first_value is not added here, whereas voc_qvalue returns
 * first_value + value_of_unknown -- confirm which is intended.
 */
int voc_value_of_unknown(int u)
{
  if (voc[u].has_unknown == 0)
    /*error("VOC: vocabulary %s has no unknown\n", voc[u].file_name);*/
    mexErrMsgTxt("VOC: vocabulary has no unknown\n", voc[u].file_name);
  return voc[u].value_of_unknown;
}

/* Returns nonzero when vocabulary u has an unknown word and w equals its spelling. */
int voc_is_unknown(const int u, const char *w)
{
  return (voc[u].has_unknown && !strcmp(voc[u].spelling_of_unknown,w));
}