/* PARSE.C Loglan parser top level / Yacc front end Copyright (C) 1982 by The Loglan Institute, Inc. Created 82.1.27 Gyro Largely modified by RAM Sep 88 to unify with Macintosh programs The "front end" part of this program reads preprocessed specimens from a file or the console, and hands the words to yyparse (the Yacc parser). It then displays or writes out the resulting parse.*/ #define global //Needs to be commented out for MACDOS and DOS. #include "extval.h" #include "yc.tab" void Init(void) { /* set up the world of LIP*/ wallp = FALSE; /* defaults are no output and linear parse*/ nconts = 0; /* number of nestings for separate parses*/ TreeInit(); } short Fixlex(short lex) /* On entry lex is a subscripted lexeme. Remove the subscript, and return an artificial lexeme value for the humanized parse.*/ { switch(lex) { case A1: case A2: case A3: case A4: return A; case GA2: return GA; case GE2: return GE; case KA1: case KA2: case KA3: return KA; case NO1: case NO2: case NO3: case NO4: return NO; case PA1: case PA2: return PA; case ZE2: return ZE; //ZE1 removed Jun 97 default: return lex; } } void Parse(void) /* parse an entry */ { short i,count,flag; TREE *yew; Acap = Aspace; icont = 0; #ifdef MACDOS /* This segment is for corpus parsing. DOS type LIP only */ if (!consp && !textf) /* get corpus entry's label */ IGetField (label); #endif Lex(); /* lex the specimen */ redo: for (i=0; i<10; ++i) fullprs[i] = actualprs[i] = humactprs[i] = 0; /* RAM: freemods are marked, counted in nconts, and then parsed separately. Each nest has lexemes multiplied by nconts*1000, so the lexemes or parse level must be recovered when required. */ for (icont = nconts-1; icont >= 0; --icont) { curword = 0; /* RAM Jul 87 This segment allows a partial parse to be output in interactive mode. In Corpus mode, it still outputs a * as before. Additional information includes the state number at failure. */ count = 0; for(i=0;inwords) flag=0; if (flag) { /* Replace ZE1 with ZE2 before argsign */ InsZe2(); /* Replace PA2 with PA1 before pred-sign */ InsM7(); /* Replace KA3 with KA2|KA1 before pred-sign|link */ InsM3M2(); InsM7(); /*again to permit PA before kekpreds*/ /* Replace NO4 with NO3|NO2|NO1 before argsign|markpred|PA */ InsM8M4(); /* Replace A4 with A1|A2|A3 before pred|linkargs|argmod respectively.*/ InsM10M1M6M5M9M11(); goto redo; } badparse = 1; /*Flag a bad parse*/ #ifdef DOS if (wallp==1) fprintf(obuf, "\nFailure at token %d and at state %d\n", curword,yystate); if ((consp || textf)) { nfputs ((unsigned char *)"\nTHIS EXPRESSION DOES NOT PARSE",0); if(consp)printf("\nFailure at token %d and at state %d\n", curword,yystate); if (textf || itextf ) fprintf(sbuf, "\nFailure at token %d and at state %d\n", curword,yystate); } #endif if ((consp || textf)) { /* RAM: 90 Failed parses are shown only if yystate is 1 (i.e. the parser detected a logical end before the actual end of the sentence, and at the lowest nesting level, and if all-at-once text processing is not in effect.*/ if (!icont&& yystate==1) { #ifdef DOS nfputs ((unsigned char *)"\nAn incomplete parse to the point of failure follows\n",0); #endif fullprs[icont] = yyold; /*This is the standard processing for a successful parse. See below*/ if ((yew=FixVoc(fullprs[icont])) !=NULL) fullprs[icont] = (TREE *)NewNode (yew, fullprs[icont]); actualprs[icont] = TreeFlatten (fullprs[icont]); humactprs[icont] = DelGUs (actualprs[icont]); } else for (i=0; in.ngrameme = "freemods"; fullprs[0] = ctree(NewNode(fullprs[1],fullprs[0])); fullprs[1] = 0; } /* RAN: 88 If there is only one parse, which has a number greater than 0, it is a freemod standing alone. Move it to level 0. */ else if (fullprs[1]) { fullprs[0] = fullprs[1]; fullprs[1] = 0; } /* RAM: 90 This routine deals with vocatives parsed as terms. They must be detached and moved to an appropriate place in the tree, and reattached to the tree at that point. This must be done before any tree stripping takes place*/ if ((yew=FixVoc(fullprs[0])) !=NULL) fullprs[0] = (TREE *)NewNode(yew,fullprs[0]); /* RAM 90 Trees contain a lot of 'branches' that do nøt branch, but reduce from one single grameme to another (unary reductions). For the 'actual parse', these are stripped out. The full tree is now available only in a hidden manner by JCB's order */ actualprs[0] = TreeFlatten (fullprs[0]); /* The humanized actual parse has all inserted right-hand ends removed */ humactprs[0] = DelGUs (actualprs[0]); nconts = 1; } void WriteRslt(short n) /* write/display resulting parse */ /*RAM 90 This is the main routine for writing a parse in the 'linear' form. n is a flag indicating if lexemes or Loglan words are to be written, and whether subscripted lexemes should be desubscripted */ { if (consp) WriteString (humactprs,n); #ifdef MACDOS else if (textf) WriteString (humactprs,n); /*text files for IBM*/ else { /*This section deals with writing corpus parses*/ fputs (label, obuf); putc ('\t', obuf); fputs ("\n\t", obuf); WriteString (actualprs,0); fputs ("\n\t", obuf); WriteString (humactprs,0); fputs ("\n", obuf); } #endif } void WriteString (TREE **slist,short n) /* write parse trees as strings */ /* RAM 90 Write the parse tree slist as a linear parenthesized string, using flag n to indicate if lexemes or Loglan words, and also if subscripted lexemes should be desubscripted for the humanized parse (n=2) */ { /*RAM 90 This part is largely vestigial from the times when multiple short sentences were permitted. As icont is now used for separate parsing of freemods, icont should always be 0 at this point */ if (slist[0]) { TreeString (slist[0],n); if (badparse) { /*bad parses are only written if state = 1*/ nfputs ((unsigned char *)"....?",0); nfputc (NL); } } else if (obuf) putc ('*', obuf); /*and never for corpus parsing*/ } void DspString (TREE **slist, Boolean reasonable) /* display trees */ /*RAM 90 Display slist as a tree, either a full tree with unary reductions, or a 'reasonable' tree with only gramemes with more than one component shown*/ { short icont; /*Vestigial separator removed. Vestigial doloop kept*/ for (icont = 0; icont < nconts; ++icont) if (slist[icont]) TreeDsp (slist[icont],reasonable); } TREE* DelGUs (TREE *oak) /* delete "invisible" punctuators */ /*RAM 90 some parsing errors are equivalent in the machine to right-end punctuators. When this is the case, an 'invisible' punctuator is inserted in the parse. These are only shown in the 'machine' parse, and deleted for the 'humanized' parse. oak is the (partial) tree structure being processed. The tree is 'flattened' at the same time by removing unary reductions */ { NODE *tnode; short ikid; TREE *ttree; /* RAM 90 Return the current tree structure if a leaf, and visible, else nothing*/ if (IsLeaf (oak)) return (oak->l.lsource != WINVISIBLE) ? oak : NULL; /* RAM 90 If a node build or add to a new tree which will be devoid of invisible punctuation */ tnode = NodeCopy (oak); tnode->nlength = 0; /* RAM 90 Recursively delete the invisible punctuation from each node branch, adding each node or leaf returned, and ignoring the 'invisible' leaves */ for (ikid = 0; ikid < oak->n.nlength; ++ikid) { ttree = DelGUs (oak->n.nkids[ikid]); if (ttree) tnode->nkids[tnode->nlength++] = ttree; } /* and deleting the unary reductions at the same time */ if (tnode->nlength > 1) return (ctree(tnode)); /* flatten as we go */ else if (tnode->nlength == 1) return (tnode->nkids[0]); else return (NULL); } void Lex(void) /* RAM 90 Lex the current specimen. Put each word in lower case and add it to a lexeme list. Words that may take non Loglan words following have those words 'gobbled' into the lexeme. These are LIE (used for quoting foreign words and phrases), SUE (for onomatopoeia), and LAO (for Linnaean names). The preparser is then called for further manipulation */ { char twd[WORDMAX]; //short iword; nwords = curword = addend = 0; while (IGetToken (twd)) { DnCase (twd); LexAddWd (twd); if (lexemes [nwords - 1]%1000 == LIE) LexLie(); if (lexemes [nwords - 1]%1000 == SUE) LexSue(); if (lexemes [nwords - 1]%1000 == LAO) LexLao(); } nconts = 1; PreParse(); } void LexAddWd (char *wd) /*RAM 90 add word wd to specimen. Find the lexeme with LexWord. RA, HU, ZO, and NOI are grammatically but not semantically identical to some other lexemes. They need to remain separate lexemes for the preparser, but may be lumped in with another lexeme for parsing. LWs if not in the lexeme list are returned with lexeme BAD, and are checked to see if they break into valid lexemes, which are then added as separate pieces. RAM 92. Modification were made to move all of the compound analysis to Preparse instead of having it partly here. */ { short i, tlex, nparts, partlex[15]; Boolean brkmap[WORDMAX]; /*indicates break before corresponding char in wd*/ char parts[15][5], *parptr[15], *cptr; tlex = LexWord (wd); /* RAM Jan 88. Removed Lexemes RA HU and ZO from the Trial.67 grammar. The preparser returns them and the code here changes them to the equivalent lexemes NI DA and PO */ switch(tlex) { case MA: case RA: tlex = NI; break; case MO: case HU: tlex = DA; break; case ZO: tlex = PO; break; case NOI: tlex = NI; break; case KOU: tlex = PA2; break; } if (tlex != BAD || strlen (wd) < 3) { LAddLex (wd, tlex); return; } /*RAM 92 All little words are now compounded. Break up the compound and lex the pieces. Then analyse which should remain compounded in LexOfCpd */ LexBreakCPD (wd, brkmap); /*Break up compounded little words*/ i = 0; nparts = 0; repeat { /*Isolate word into null terminated parts*/ cptr = parts[nparts++]; do *cptr++ = wd[i++]; while (wd[i] && !brkmap[i]); *cptr = NUL; if (!wd[i]) break; } /*Lex the individual parts*/ for (i = 0; i < nparts; ++i) { partlex[i] = LexWord (parts[i]); parptr[i] = &parts[i][0]; } /* RAM 92. Now examine the decompounded word to decide which parts should remain compounded */ LexOfCPD (parptr, partlex, nparts); } void LexLie(void) /* This routine chews up a 'lie' expression so we don't try to lex the words inside. This is the strong quotation which is followed by a LW, a non-Loglan expression and the same LW. The whole expression is considered an instance of LIE. */ { short iword; char twd[WORDMAX], liematch[WORDMAX]; iword = nwords; repeat { if (!IGetToken (twd)) { /*Get the first flag token*/ alert(2,4); break; } DnCase (twd); /*put it in lower case */ LAddLex (twd,0); /*Add it to a single string, unlexed*/ /* Copy the first word to liematch for matching purposes, else compare each word read with liematch until a match is found */ if (iword == nwords - 1) strcpy (liematch, twd); else if (match (twd, liematch)) break; } /* Combine all the words into one node, to be considered as a leaf called lie-exp and reduce the word-count appropriately.*/ leaves[iword] = cleaf(Node ("lie-exp", nwords - iword, (TREE**)&leaves[iword])); nwords = iword + 1; } void LexLao(void) /* This routine chews up a 'lao' expression so we don't try to lex the words inside. LAO is the Linnaean descriptor. It is virtually identical to the above, except that a comma indicates the end of the expression, and the pseudo-leaf is called Linnaean. */ { short iword; char twd[WORDMAX]; iword = nwords; repeat { if (!IGetToken (twd)) { break; } if (!strcmp(twd,",")) break; DnCase (twd); LAddLex (twd, 0); } leaves[iword] = cleaf(Node ("Foreign", nwords - iword,(TREE**)&leaves[iword])); nwords = iword + 1; } void LexSue(void) /* This routine chews up a 'sue' expression so we don't try to lex the words inside. SUE is the onamatopoetic descriptor. It is virtually identical to the above, A comma indicates the end of the expression, and the pseudo-leaf is called sue-exp. */ { short iword; char twd[WORDMAX]; iword = nwords; repeat { if (!IGetToken (twd)) { break; } if (!strcmp(twd,",")) break; DnCase (twd); LAddLex (twd, 0); } leaves[iword] = cleaf(Node ("sue-exp", nwords - iword,(TREE**)&leaves[iword])); nwords = iword + 1; } short LexWord (char *wd) /* Get the lexical class of (non-CPD) */ { short tlen, tlex; tlen = strlen (wd); /*All words ending in a consonant are in lexeme DJAN*/ if (IsCons (wd [tlen - 1])) return (DJAN); /* All other words with two adjacent consonants are PREDAs */ if (!CpdP (wd)) return (PREDA); /* All other words are looked up in a dictionary of words vs. lexemes*/ if ((tlex = LexLookup (wd)) >= 0) return (tlex); /* If not present, but of form consonant + certain endings, or vowel + certain other endings, they are TAI = lettorals */ if (MatchList (wd, "Vfi", "Vsi", "Cei","Ceo", NULL)) return (TAI); if (MatchList (wd, "Vma", "Cai", NULL)) return (TAI); /* Otherwise they are illegal or compounded LWs which must be examined separately */ return (BAD); } void LAddLex (char *wd, short lex) /* add a word to string */ /* This routine associates a word wd with a lexeme lex */ { if (lex == BAD) /*Notify the user if a word cannot be lexed*/ #ifdef DOS message(2,5,(unsigned char *)wd); #else message(2,5,CtoPstr(wd)); #endif /* Make a tree leaf of the word and its lexeme, and save the lexeme also in a list of lexemes */ leaves[nwords] = Leaf (wd,lex, WSTART); lexemes[nwords] = lex; if (++nwords > STRINGMAX) { /* Check for array overflow*/ alert(2,6); --nwords; } } void LexDelWd(short iword) /* Delete the word iword from the leaves and lexeme vector. Used for deleting non-lexemic pauses*/ { if (nwords < 1) { /*Do not delete if no words to delete*/ alert(2,7); return; } /*Remove a leaf and vector item, close up the space, and reduce the word count.*/ arydel ((char *)leaves, nwords, sizeof(*leaves), iword, iword + 1); arydel ((char *)lexemes, nwords, sizeof(*lexemes), iword, iword + 1); --nwords; } short LexLookup (char *wd) /* Look up word wd in the lexeme table */ { short lo, hi, try, result; lo = 0; hi = lextabsiz - 1; /* set up for binary search */ repeat { if (hi < lo) return (-1); /* failure */ try = (lo + hi) / 2; result = strcmp (lextable[try].lexword, wd); if (result == 0) return (lextable[try].lexval); /* success! */ if (result < 0) lo = try + 1; else hi = try - 1; } } void LexWrite (void) /* Write out list of the words with their lexemes. One of the display possibilities */ { short iword; for (iword = 0; iword < nwords; ++iword) { /*Write the word list*/ if (iword > 0) nfputc (' '); TreeList ((TREE *)leaves[iword]); } /* RAM Mar 88, write lexeme string as well as words */ #ifndef DOS nfputc(CR); for (iword = 0; iword < nwords; ++iword) { nfputs((unsigned char *)lexnames[lexemes[iword]%1000-257],0); nfputc(SP); if (wallp==1) fprintf(obuf,"%s ", lexnames[lexemes[iword]%1000-257]); } nfputc(CR); #else if (consp) { /*Added May 88 for corpus parse */ nfputc('\n'); nfputs((unsigned char *)" ",0); for (iword = 0; iword < nwords; ++iword) { printf("%s ",lexnames[lexemes[iword]%1000-257]); if (wallp==1) fprintf(obuf,"%s ", lexnames[lexemes[iword]%1000-257]); } } #endif } Boolean MatchList (char *wd, char *patlist,...) /* see if word wd matches any pattern contained in patlist. A patlist has upper case C to mean any consonant, and upper case V for any vowel, and ':' matches any letter, including end-of-word */ { char *tc, *tp; short ipat; for (ipat = 0; (&patlist)[ipat] != NULL; ++ipat) { if (ipat >= 20) { /*pat list too long*/ alert(2,8); return (FALSE); } tc = wd; tp = (&patlist)[ipat]; /*Examine an individual word from the patlist*/ while (*tc && *tp) { if (*tp == 'C') { if (!IsCons (*tc)) break; } /*No match*/ else if (*tp == 'V') { if (!IsVowel (*tc)) break; } else if (*tp == ':') return TRUE; else if (*tp != 'X' && *tp != *tc) break; ++tc; ++tp; } if (!*tc && (!*tp || *tp == ':')) return TRUE; } return (FALSE); } Boolean CpdP (char *sc) /* Check a word sc for two adjacent consonants */ { Boolean prevcons, thiscons; prevcons = thiscons = FALSE; while (*sc) { thiscons = IsCons (*sc++); if (prevcons && thiscons) return (FALSE); prevcons = thiscons; } return (!prevcons); } void DnCase (char *s) /* force a string s to lower case */ { while (*s) { *s = tolower (*s); ++s; } } Boolean IsCons (char c) /* test character c for consonant */ { return (isalpha (c) && !IsVowel (c)); } Boolean IsVowel (char c) /* test character c for vowel */ { switch (tolower (c)) { case 'a': case 'e': case 'i': case 'o': case 'u': return (TRUE); default: return (FALSE); } } /* End of Parse.c */