/* recap.c -- re-capitalize input files * Tim Showalter, 30 Nov 2009 * * Inspired by the recap program on Andrew, many years ago, by Chris Newman * and/or Doug DeCarlo. I should have saved it -- I suspect that version * is superior to this one. */ #include #include #include #include #include #include #define error(fmt) (fprintf(stderr, ("recap: " fmt "\n")), exit(EX_SOFTWARE)) #define errorf(fmt, args...) \ (fprintf(stderr, ("recap: " fmt "\n"), args), exit(EX_SOFTWARE)) static const size_t MAX_WORDLEN = 128; static char **special_cases = 0; static size_t n_special_cases = 0; static size_t a_special_cases = 0; static int pstrcasecmp(const void *a1, const void *a2) { return strcasecmp(*(const char **)a1, *(const char **)a2); } static void load_special_cases(const char *fn) { FILE *f = fopen(fn, "r"); if (!f) { errorf("can't open file %s", fn); } char buf[MAX_WORDLEN]; while (fgets(buf, sizeof(buf), f) != 0) { size_t l = strlen(buf); if (buf[l - 1] == '\n') { buf[--l] = '\0'; } if (a_special_cases == n_special_cases) { if (a_special_cases == 0) a_special_cases = 128; special_cases = realloc(special_cases, (sizeof(special_cases[0]) * a_special_cases)); if (!special_cases) { error("can't reallocate memory"); } } special_cases[n_special_cases] = strdup(buf); if (!special_cases[n_special_cases]) { error("can't strdup"); } n_special_cases++; } fclose(f); qsort(special_cases, n_special_cases, sizeof(char *), pstrcasecmp); } static void write_word(int usually_upcase, char *buf, FILE *f) { const char **sc = (const char **) bsearch(&buf, special_cases, n_special_cases, sizeof(char *), pstrcasecmp); if (sc) { fputs(*sc, f); } else { char *p; for (p = buf; *p; ++p) { *p = tolower(*p); } if (usually_upcase) { buf[0] = toupper(buf[0]); } fputs(buf, f); } } static void process_nonalpha(FILE *out, FILE *in, int *start_sentence) { int ch; while ((ch = getc(in)) != EOF) { if (isalpha(ch)) { ungetc(ch, in); break; } if (ch == '?' || ch == '!' || ch == '.') { *start_sentence = 1; } else if (isdigit(ch)) { *start_sentence = 0; } putc(ch, out); } } static void process_alpha(FILE *out, FILE *in, int start_sentence) { char buf[MAX_WORDLEN]; int ch; int i = 0; while ((ch = getc(in)) != EOF) { if (i >= sizeof(buf) - 1 || !isalpha(ch)) { ungetc(ch, in); break; } buf[i++] = ch; } if (i) { buf[i] = '\0'; write_word(start_sentence, buf, out); } } static void recap(FILE *out, FILE *in) { int start_sentence = 1; while (!feof(in) && !ferror(in) && !feof(out) && !ferror(out)) { process_nonalpha(out, in, &start_sentence); process_alpha(out, in, start_sentence); start_sentence = 0; } } static void usage() { fprintf(stderr, "\ recap: a program to re-capitalize input files in accordance with standard\n\ grammar\n\ \n\ Usage: recap [opts] < INPUT > OUTPUT\n\ \n\ Options:\n\ -s FILENAME ...... special cases (proper nouns) to be capitalized\n\ even when they aren't at the beginning of a sentence\n\ "); } int main(int argc, char *argv[]) { { int ch; while (-1 != (ch = getopt(argc, argv, "hs:"))) { switch (ch) { case 'h': usage(); return EX_USAGE; case 's': load_special_cases(optarg); break; default: errorf("internal error--unknown character %c", ch); break; } } } recap(stdout, stdin); return 0; } /* * * * ASCII only, English only. 8 bit data will confuse the tokenizer. * * Tim Showalter * */