diff options
Diffstat (limited to 'hyp.c')
-rw-r--r-- | hyp.c | 262 |
1 files changed, 262 insertions, 0 deletions
@@ -0,0 +1,262 @@ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <hyphen.h> +#include "config.h" + +/* Read a tag into a character array and return its length. */ +int +readtag(char *tag, FILE *in, FILE *out) +{ + char ch; + int i = 0; + + /* Data after a space in a tag is irrelevant. */ + while ((ch = fgetc(in)) != '>' && ch != ' ') { + fputc(ch, out); + tag[i] = ch; + ++i; + } + + if (ch == ' ') { + /* Seek to the end of the tag. */ + while (ch != '>' + && ch != EOF) { + fputc(ch, out); + ch = fgetc(in); + } + if (ch == EOF) { + return -1; + } + fputc(ch, out); + } + else + fputc(ch, out); + + tag[i] = '\0'; + return i; +} + +int +cmptag(char *tag2, FILE *in, FILE *out) +{ + char tag1[MAXWLEN]; + readtag(tag1, in, out); + + return strcmp(tag1, tag2); +} + +int +checktag(char *tag, int tagamt, FILE *in, FILE *out) +{ + int i; + for (i = 0; i < tagamt; ++i) + if (strcmp(tag, taglist[i]) == 0) + return i; + return -1; +} + +/* Check if a character should be skipped. */ +int +checkskip(char ch, int skiplen) +{ + int i; + for (i = 0; i < skiplen; ++i) + if (ch == skip[i]) + return 1; + + return 0; +} + +const char *punct = "';.,\"!?:"; +/* Check if a character is punctuation. */ +int +checkpunct(char ch) +{ + int i; + for (i = 0; punct[i] != '\0'; ++i) + if (ch == punct[i]) + return 1; + + return 0; +} + +const char *blank = " \n\r\t"; +/* Check if a character is a blank. */ +int +checkblank(char ch) +{ + int i; + for (i = 0; blank[i] != '\0'; ++i) + if (ch == blank[i]) + return 1; + + return 0; +} + +/* Loop until the body is found. */ +int +findbody(FILE *in, FILE *out) +{ + char ch; + while ((ch = fgetc(in)) != EOF) { + fputc(ch, out); + if (ch == '<' + && cmptag("body", in, out) == 0) + return 1; + } + + return 0; +} + +/* Hyphenate a word, by means of hyphen library. +This is done so as to leverage sufficient hyphenation +patterns, with the ones used here having been taken +from those developed for TeX. */ +void +hypword(char *word, int len, FILE *in, FILE *out, HyphenDict *dict) +{ + if (len < MINWLEN) { + fprintf(out, "%s", word); + return; + } + + char *hyphens = calloc(len + 6, sizeof(char)); + char *hyphword = calloc(len << 1, sizeof(char)); + char **rep = NULL; + int *pos = NULL, *cut = NULL; + hnj_hyphen_hyphenate2(dict, word, len, hyphens, + hyphword, &rep, &pos, &cut); + +/* fprintf(stderr, "%s\n%s\n%s\n", word, hyphens, hyphword); */ + + /* Process the given hyphenation. */ + int i; + char oldch = ' ', oldoldch = '\0'; + for (i = 0; i < strlen(hyphword); ++i) { + if (hyphword[i] == '=' && oldch != ' ' + && oldoldch != ' ') + fputs("­", out); + else if (hyphword[i] != '=') + fputc(hyphword[i], out); + + oldoldch = oldch; + oldch = hyphword[i]; + } + + free(hyphens); + free(hyphword); +} + +/* Hyphenate the words within a tag. */ +void +hyptag(FILE *in, FILE *out, int skiplen, char *tag, HyphenDict *dict) +{ + char ch, word[MAXWLEN], term[MAXWLEN] = "/"; + int i = 0; + + strcat(term, tag); + + while ((ch = fgetc(in)) != EOF) { + if (i < 0) { + fputc(ch, out); + ++i; + if (checkskip(ch, skiplen)) i -= 3; + continue; + } + if (checkblank(ch)) { + word[i] = '\0'; + hypword(word, i, in, out, dict); + i = 0; + } + if (checkpunct(ch)) { + word[i] = '\0'; + hypword(word, i, in, out, dict); + fputc(ch, out); + i = 0; + } + else if (checkskip(ch, skiplen)) { + word[i] = '\0'; + fputs(word, out); + fputc(ch, out); + /* A simple way of working around + HTML character codes. Each is 5 ( epsiv ) + or 6 ( hellip ) characters long, plus '&' and ';'. */ + i = -3; + } + /* Check for closing tag. */ + else if (ch == '<') { + word[i] = ch; + ++i; + word[i] = '\0'; + hypword(word, i, in, out, dict); + i = 0; + readtag(word, in, out); + if (strcmp(word, term) == 0) break; + } + else { + word[i] = ch; + ++i; + } + if (i == MAXWLEN - 1) { + word[i] = '\0'; + hypword(word, i, in, out, dict); + i = 0; + } + } +} + +/* Hyphenate HTML input via `­'. +hyp [in] [out] */ +int +main(int argc, char **argv) +{ + FILE *in; + if (argc < 2) + in = stdin; + else { + in = fopen(argv[1], "r"); + if (in == NULL) { + printf("%s %s\n", argv[1], "inaccessible."); + return 1; + } + } + + FILE *out; + if (argc < 3) + out = stdout; + else { + out = fopen(argv[2], "w"); + if (out == NULL) { + printf("%s %s\n", argv[2], "inaccessible."); + return 2; + } + } + + if (findbody(in, out) == 0) { + puts("There is no body."); + return 3; + } + + HyphenDict *dict = hnj_hyphen_load(dictfile); + if (dict == NULL) { + puts("Dict not readable."); + return 4; + } + dict->utf8 = 1; + + int tagamt = 0; + while (taglist[tagamt][0] != '\0') + ++tagamt; + int skiplen = strlen(skip); + + char ch, tag[MAXWLEN]; + int len; + while ((ch = fgetc(in)) != EOF) { + fputc(ch, out); + if (ch == '<' && (len = readtag(tag, in, out)) > 0 + && checktag(tag, tagamt, in, out) != -1) + hyptag(in, out, skiplen, tag, dict); + } + return 0; +} |