summary refs log tree commit diff
path: root/win/hyp.c
diff options
context:
space:
mode:
author kaa <kaa@laptosh.my.domain> 2023-06-23 11:56:56 -0700
committer kaa <kaa@laptosh.my.domain> 2023-06-23 11:56:56 -0700
commit 09819bcd940492c8ccc48284880f8bc652a2845a (patch)
tree 89f5425d6907b111afd74142866440fba9bced0c /win/hyp.c
Initial.
Diffstat (limited to 'win/hyp.c')
-rw-r--r-- win/hyp.c 262
1 files changed, 262 insertions, 0 deletions
diff --git a/win/hyp.c b/win/hyp.c
new file mode 100644
index 0000000..697fed5
--- /dev/null
+++ b/win/hyp.c
@@ -0,0 +1,262 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <hyphen.h>
+#include "config.h"
+
+/* Read a tag name from `in' into `tag' and return its length.
+ * The callers in this file invoke it right after reading a '<'.  Every
+ * character consumed, up to and including the closing '>', is copied to
+ * `out'.  Only the text before the first space is stored; the rest of
+ * the tag is skipped.  At most MAXWLEN - 1 characters are stored (every
+ * caller in this file passes a MAXWLEN-sized buffer) and `tag' is always
+ * NUL-terminated.
+ * Returns -1 if the input ends before the closing '>'. */
+int
+readtag(char *tag, FILE *in, FILE *out)
+{
+    /* int, not char: fgetc() reports EOF as an out-of-band int value. */
+    int ch;
+    int i = 0;
+
+    /* Data after a space in a tag is irrelevant. */
+    while ((ch = fgetc(in)) != EOF && ch != '>' && ch != ' ') {
+        fputc(ch, out);
+        /* Keep echoing an over-long name, but stop storing it. */
+        if (i < MAXWLEN - 1)
+            tag[i++] = (char)ch;
+    }
+    tag[i] = '\0';
+
+    /* Seek to the end of the tag, echoing whatever is skipped. */
+    while (ch != EOF && ch != '>') {
+        fputc(ch, out);
+        ch = fgetc(in);
+    }
+    if (ch == EOF)
+        return -1;
+
+    /* Echo the closing '>'. */
+    fputc(ch, out);
+    return i;
+}
+
+/* Read the next tag from `in' with readtag() (which echoes it to `out')
+ * and compare its name with `tag2'.
+ * Returns 0 when the names are equal and a non-zero value otherwise,
+ * including when readtag() reports that the input ended inside the tag. */
+int
+cmptag(char *tag2, FILE *in, FILE *out)
+{
+    /* Zero-filled so the buffer holds a valid string even if readtag()
+     * gives up before terminating it. */
+    char tag1[MAXWLEN] = {0};
+
+    if (readtag(tag1, in, out) < 0)
+        return -1;
+
+    return strcmp(tag1, tag2);
+}
+
+/* Look `tag' up among the first `tagamt' entries of taglist.
+ * Returns the index of the first entry equal to `tag', or -1 if there
+ * is none.  The two stream arguments are not used. */
+int
+checktag(char *tag, int tagamt, FILE *in, FILE *out)
+{
+    int idx = 0;
+
+    (void)in;
+    (void)out;
+
+    while (idx < tagamt) {
+        if (!strcmp(taglist[idx], tag))
+            return idx;
+        ++idx;
+    }
+
+    return -1;
+}
+
+/* Tell whether `ch' occurs among the first `skiplen' entries of skip:
+ * returns 1 if it does and 0 if it does not. */
+int
+checkskip(char ch, int skiplen)
+{
+    int pos = 0;
+
+    while (pos < skiplen) {
+        if (skip[pos] == ch)
+            return 1;
+        ++pos;
+    }
+
+    return 0;
+}
+
+/* Characters treated as punctuation. */
+const char *punct = "';.,\"!?:";
+/* Return 1 if `ch' is one of the characters in punct, 0 otherwise. */
+int
+checkpunct(char ch)
+{
+    /* strchr() would also match the terminating NUL, which is not a
+     * punctuation character, so rule it out first. */
+    return ch != '\0' && strchr(punct, ch) != NULL;
+}
+
+/* Characters treated as blanks: space, newline, carriage return, tab. */
+const char *blank = " \n\r\t";
+/* Return 1 if `ch' is one of the characters in blank, 0 otherwise. */
+int
+checkblank(char ch)
+{
+    const char *p = blank;
+
+    while (*p != '\0') {
+        if (*p == ch)
+            return 1;
+        ++p;
+    }
+
+    return 0;
+}
+
+/* Copy `in' to `out' until the body tag has been read.
+ * Each '<' is followed by a cmptag() call, which reads (and echoes) the
+ * tag that follows and compares its name with "body".
+ * Returns 1 once that tag has been found, 0 if the input ends first. */
+int
+findbody(FILE *in, FILE *out)
+{
+    /* int, not char: fgetc() reports EOF as an out-of-band int value, so
+     * a char would either mistake a 0xFF byte for EOF or never see it. */
+    int ch;
+
+    while ((ch = fgetc(in)) != EOF) {
+        fputc(ch, out);
+        if (ch == '<'
+            && cmptag("body", in, out) == 0)
+            return 1;
+    }
+
+    return 0;
+}
+
+/* Hyphenate a word, by means of hyphen library.
+ * This is done so as to leverage sufficient hyphenation
+ * patterns, with the ones used here having been taken
+ * from those developed for TeX.
+ *
+ * `word' holds `len' characters and is NUL-terminated.  Words shorter
+ * than MINWLEN are written to `out' unchanged.  Otherwise the word is
+ * hyphenated with `dict' and written to `out' with "&shy;" at the break
+ * points.  If memory cannot be allocated or the library reports an
+ * error, the word is written unchanged.  `in' is not used. */
+void
+hypword(char *word, int len, FILE *in, FILE *out, HyphenDict *dict)
+{
+    char *hyphens, *hyphword;
+    /* Set by the library only when it allocates them; ours to free. */
+    char **rep = NULL;
+    int *pos = NULL, *cut = NULL;
+    size_t i, hyplen;
+    int k;
+    char oldch = ' ', oldoldch = '\0';
+
+    (void)in;
+
+    if (len < MINWLEN) {
+        fputs(word, out);
+        return;
+    }
+
+    hyphens = calloc((size_t)len + 6, sizeof *hyphens);
+    /* Room for a '=' after every character plus the terminator. */
+    hyphword = calloc(2 * (size_t)len + 1, sizeof *hyphword);
+    if (hyphens == NULL || hyphword == NULL
+        || hnj_hyphen_hyphenate2(dict, word, len, hyphens,
+                                 hyphword, &rep, &pos, &cut) != 0) {
+        /* Better an unhyphenated word than a missing one. */
+        fputs(word, out);
+        goto done;
+    }
+
+    /* Process the given hyphenation: '=' marks a break point.  A break
+     * is not emitted when either of the two preceding characters is a
+     * space (the previous one counts as a space at the very start). */
+    hyplen = strlen(hyphword);
+    for (i = 0; i < hyplen; ++i) {
+        if (hyphword[i] == '=' && oldch != ' '
+            && oldoldch != ' ')
+            fputs("&shy;", out);
+        else if (hyphword[i] != '=')
+            fputc(hyphword[i], out);
+
+        oldoldch = oldch;
+        oldch = hyphword[i];
+    }
+
+done:
+    /* rep holds one replacement string (or NULL) per input character. */
+    if (rep != NULL) {
+        for (k = 0; k < len; ++k)
+            free(rep[k]);
+        free(rep);
+    }
+    free(pos);
+    free(cut);
+    free(hyphens);
+    free(hyphword);
+}
+
+/* Hyphenate the words within a tag.
+ * Copies `in' to `out', passing each word through hypword(), until the
+ * closing tag ("/" followed by `tag') has been read or the input ends.
+ * `skiplen' is the number of entries of skip that checkskip() tests. */
+void
+hyptag(FILE *in, FILE *out, int skiplen, char *tag, HyphenDict *dict)
+{
+    /* int, not char: fgetc() reports EOF as an out-of-band int value. */
+    int ch;
+    /* term gets one extra byte so "/" plus a tag of MAXWLEN - 1
+     * characters still fits with its terminator. */
+    char word[MAXWLEN], term[MAXWLEN + 1];
+    /* Number of characters buffered in word.  While negative, it counts
+     * the characters still to be copied through untouched. */
+    int i = 0;
+
+    snprintf(term, sizeof term, "/%s", tag);
+
+    while ((ch = fgetc(in)) != EOF) {
+        if (i < 0) {
+            fputc(ch, out);
+            ++i;
+            if (checkskip((char)ch, skiplen)) i -= 3;
+            continue;
+        }
+        if (checkblank((char)ch)) {
+            word[i] = '\0';
+            hypword(word, i, in, out, dict);
+            i = 0;
+            /* No `else' below: the blank falls through to the last
+             * branch and becomes the first character of the next
+             * word, so it is written out together with that word. */
+        }
+        if (checkpunct((char)ch)) {
+            word[i] = '\0';
+            hypword(word, i, in, out, dict);
+            fputc(ch, out);
+            i = 0;
+        }
+        else if (checkskip((char)ch, skiplen)) {
+            word[i] = '\0';
+            fputs(word, out);
+            fputc(ch, out);
+            /* A simple way of working around
+            HTML character codes. Each is 5 ( epsiv )
+            or 6 ( hellip ) characters long, plus '&' and ';'.
+            The next three characters are copied untouched. */
+            i = -3;
+        }
+        /* Check for closing tag. */
+        else if (ch == '<') {
+            /* The '<' is written out as the tail of the pending word. */
+            word[i] = (char)ch;
+            ++i;
+            word[i] = '\0';
+            hypword(word, i, in, out, dict);
+            i = 0;
+            /* Stop at the closing tag, or when the input ends inside
+             * a tag. */
+            if (readtag(word, in, out) < 0
+                || strcmp(word, term) == 0)
+                break;
+        }
+        else {
+            word[i] = (char)ch;
+            ++i;
+        }
+        /* Flush before the buffer overflows. */
+        if (i == MAXWLEN - 1) {
+            word[i] = '\0';
+            hypword(word, i, in, out, dict);
+            i = 0;
+        }
+    }
+
+    /* The input ended before the closing tag: do not lose the word that
+     * was still being collected. */
+    if (i > 0) {
+        word[i] = '\0';
+        hypword(word, i, in, out, dict);
+    }
+}
+
+/* Hyphenate HTML input via `&shy;'.
+ * hyp [in] [out]
+ *
+ * Input defaults to stdin and output to stdout.  Everything up to and
+ * including the body tag is copied unchanged; after that, the content of
+ * every tag named in taglist is run through hyptag().
+ * Exit status: 0 on success, 1 if the input cannot be opened, 2 if the
+ * output cannot be opened, 3 if no body tag is found, 4 if the
+ * dictionary cannot be loaded, 5 if closing the output file fails. */
+int
+main(int argc, char **argv)
+{
+    FILE *in = stdin, *out = stdout;
+    HyphenDict *dict = NULL;
+    char tag[MAXWLEN];
+    /* int, not char: fgetc() reports EOF as an out-of-band int value. */
+    int ch;
+    int tagamt = 0, skiplen;
+    int status = 0;
+
+    if (argc >= 2) {
+        in = fopen(argv[1], "r");
+        if (in == NULL) {
+            printf("%s %s\n", argv[1], "inaccessible.");
+            return 1;
+        }
+    }
+
+    if (argc >= 3) {
+        out = fopen(argv[2], "w");
+        if (out == NULL) {
+            printf("%s %s\n", argv[2], "inaccessible.");
+            status = 2;
+            goto done;
+        }
+    }
+
+    if (findbody(in, out) == 0) {
+        puts("There is no body.");
+        status = 3;
+        goto done;
+    }
+
+    dict = hnj_hyphen_load(dictfile);
+    if (dict == NULL) {
+        puts("Dict not readable.");
+        status = 4;
+        goto done;
+    }
+    dict->utf8 = 1;
+
+    /* taglist ends with an empty string. */
+    while (taglist[tagamt][0] != '\0')
+        ++tagamt;
+    skiplen = (int)strlen(skip);
+
+    while ((ch = fgetc(in)) != EOF) {
+        fputc(ch, out);
+        if (ch == '<' && readtag(tag, in, out) > 0
+            && checktag(tag, tagamt, in, out) != -1)
+            hyptag(in, out, skiplen, tag, dict);
+    }
+
+done:
+    if (dict != NULL)
+        hnj_hyphen_free(dict);
+    /* Closing flushes buffered output; a failure here means the output
+     * file is incomplete. */
+    if (out != NULL && out != stdout && fclose(out) != 0 && status == 0) {
+        puts("Output not writable.");
+        status = 5;
+    }
+    if (in != stdin)
+        fclose(in);
+    return status;
+}