First working version of Nagari converter

Pierre Abbat Sun, 13 Feb 2000 08:31:37 -0600 (CST)

The attached program correctly converts Nagari letters to graphemes, as long as
there are no consonant clusters or short i's. The sample sentence is a Gujarati
palindrome.

phma

#include <stdio.h>
#include <string.h>

/* Conversion routines between Nagari (biased to Gujarati) letters and printed characters.
   The characters 0000-007f are Unicode minus an offset that depends on
   the particular Nagari script.
   By Pierre Abbat. Preliminary version 2000-02-13 */

/* Storage type for one Nagari letter, grapheme or rule metacharacter. */
typedef unsigned short NAGCHAR;

/* Letter codes. Values are Unicode code points minus the script's base offset. */
enum {
	/* modifier signs */
	CANDRABINDU = 0x01, ANUSVARA = 0x02, VISARGA = 0x03,

	/* freestanding vowels */
	A = 0x05, AA = 0x06, I = 0x07, II = 0x08,
	U = 0x09, UU = 0x0a, RI = 0x0b, LI = 0x0c,
	E = 0x0f, AI = 0x10, O = 0x13, AU = 0x14,

	/* full consonants (consonant plus inherent vowel) */
	KA = 0x15, KHA = 0x16, GA = 0x17, GHA = 0x18, NGA = 0x19,
	CA = 0x1a, CHA = 0x1b, JA = 0x1c, JHA = 0x1d, NYA = 0x1e,
	TTA = 0x1f, TTHA = 0x20, DDA = 0x21, DDHA = 0x22, NNA = 0x23,
	TA = 0x24, THA = 0x25, DA = 0x26, DHA = 0x27, NA = 0x28,
	NNNA = 0x29,
	PA = 0x2a, PHA = 0x2b, BA = 0x2c, BHA = 0x2d, MA = 0x2e,
	YA = 0x2f, RA = 0x30, LA = 0x32, LLA = 0x33, VA = 0x35,
	SHA = 0x36, SSA = 0x37, SA = 0x38, HA = 0x39,

	/* vowel combining forms and the virama */
	VAA = 0x3e, VI = 0x3f, VII = 0x40, VU = 0x41, VUU = 0x42,
	VR = 0x43, VRR = 0x44, VE = 0x47, VAI = 0x48,
	VO = 0x4b, VAU = 0x4c, VIRAMA = 0x4d,

	/* codes in the 0x60 row: freestanding RRI and the combining form VL */
	RRI = 0x60, VL = 0x62
};

/* Codes just above the Unicode page: the blank and special consonant-vowel ligatures. */
#define BLANK 0x80
#define JAA 0x81
#define JII 0x82
#define JO 0x83
#define JAU 0x84
#define RUU 0x85

/* Consonant half-letters. Each is the corresponding full consonant's code
   with bit 0x80 set (KA 0x15 -> K 0x95), which is what lets the rule engine
   flip between the two forms with ch^128. C and P therefore have to be
   0x9a and 0xaa; shorter values would collide with the vowels U and UU. */
#define K 0x95
#define KH 0x96
#define G 0x97
#define GH 0x98
#define NG 0x99
#define C 0x9a
#define CH 0x9b
#define J 0x9c
#define JH 0x9d
#define NY 0x9e
#define TT 0x9f
#define TTH 0xa0
#define DD 0xa1
#define DDH 0xa2
#define NN 0xa3
#define T 0xa4
#define TH 0xa5
#define D 0xa6
#define DH 0xa7
#define N 0xa8
#define P 0xaa
#define PH 0xab
#define B 0xac
#define BH 0xad
#define M 0xae
#define Y 0xaf
#define R 0xb0
#define L 0xb2
#define LL 0xb3
#define V 0xb5
#define SH 0xb6
#define SS 0xb7
#define S 0xb8
#define H 0xb9

/* Assignment of code points to ligatures is based on the first letter:
   100-11f velar                                     9
   120-12f palatal                                   4
   130-13f retroflex                                 4
   140-15f dental                                   10
   160-16b bilabial                                  6
   16c-16f semivowel                                 2
   170-17f fricative                                 8
   <100    with vowels
   The numbers 100-17f are for ligatures including a vowel (including Gujarati JI);
   180-1ff are for ligatures with no vowel (some of which are unprintable).
   The number 2 is used to denote doubling, as retroflex consonants are
   denoted by symbols such as DD.
   */

/* Ligature codes. Within each group the codes are consecutive, so only the
   first member of a group carries an explicit value. */
enum {
	/* 0x100-0x17f: ligatures that include a vowel */

	/* velar */
	K2A = 0x100, KTA, KRA, KSSA,
	NGKA, NGKHA, NGGA, NGGHA, NGNGA, NGMA,

	/* palatal */
	CHRA = 0x120, JNYA,

	/* retroflex */
	TT2A = 0x130, TTRA, TTHRA, DDRA, DDHRA,

	/* dental */
	T2A = 0x140, TRA, DGA, D2A, D2HA, DBA, DRA, DVA,

	/* bilabial */
	PHRA = 0x160,

	/* fricative */
	SHVA = 0x170, SSTTA, SSTTHA, STRA,

	/* 0x180-0x1ff: the same ligatures with no vowel (vowel form + 0x80) */

	/* velar */
	K2 = 0x180, KT, KR, KSS,
	NGK, NGKH, NGG, NGGH, NGNG, NGM,

	/* palatal */
	CHR = 0x1a0, JNY,

	/* retroflex */
	TT2 = 0x1b0, TTR, TTHR, DDR, DDHR,

	/* dental */
	T2 = 0x1c0, TR, DG, D2, D2H, DB, DR, DV,

	/* bilabial */
	PHR = 0x1e0,

	/* fricative */
	SHV = 0x1f0, SSTT, SSTTH, STR
};

/* Metacharacters:
   ff00-ff07 match characters with flags 0-7 set respectively.
   ff08-ff1f repeat previous. Character is stored in a register.
   ff20-ff3f same as ff00-ff1f xored with 80, used on rhs of rules.
   ff70-ff8f separate lhs from rhs. The cursor is moved n-ff80 positions.

   */

/* Rule metacharacters. A wildcard's low three bits select the character flag
   it tests; each group of eight codes has its own register. The ...R names
   are the same wildcards plus 0x20 and are the forms used on the rhs of rules. */
enum {
	/* lhs wildcards, one row per register group */
	FSV0 = 0xff00, CHL0, BAC0, VCF0, NCV0, CV0,
	FSV1 = 0xff08, CHL1, BAC1, VCF1, NCV1, CV1,
	FSV2 = 0xff10, CHL2, BAC2, VCF2, NCV2, CV2,
	FSV3 = 0xff18, CHL3, BAC3, VCF3, NCV3, CV3,

	/* rhs counterparts */
	FSV0R = 0xff20, CHL0R, BAC0R, VCF0R, NCV0R, CV0R,
	FSV1R = 0xff28, CHL1R, BAC1R, VCF1R, NCV1R, CV1R,
	FSV2R = 0xff30, CHL2R, BAC2R, VCF2R, NCV2R, CV2R,
	FSV3R = 0xff38, CHL3R, BAC3R, VCF3R, NCV3R, CV3R,

	/* lhs/rhs separator; YIELDS+n also moves the cursor n positions */
	YIELDS = 0xff80,

	/* matches any single character */
	ANY = 0xffff
};

/* Meanings of the character flags:
   01 freestanding vowel or consonant-vowel or cluster-vowel combination.
   02 consonant half-letter.
   04 bare adandic consonant, which must be viramified.
   08 vowel combining form or virama.
   10 non-consonant-vowel, before which a bare consonant (even if dandic) must be viramified.
   20 consonant-vowel
   */

/* Character flag bits stored in charflags[]. The bit number of each flag is
   the value a wildcard metacharacter carries in its low three bits. */
enum {
	FSV = 1 << 0,	/* freestanding vowel, consonant-vowel or cluster-vowel */
	CHL = 1 << 1,	/* consonant half-letter */
	BAC = 1 << 2,	/* bare adandic consonant, which must be viramified */
	VCF = 1 << 3,	/* vowel combining form or virama */
	NCV = 1 << 4,	/* non-consonant-vowel */
	CV  = 1 << 5	/* consonant-vowel */
};

/* Flag byte (FSV, CHL, BAC, VCF, NCV, CV) for each character code, indexed by
   the NAGCHAR value. Entries 0x00-0x7f follow the Unicode page; 0x80-0x94 hold
   the blank and the special consonant-vowel ligatures; 0x95-0xb9 hold the
   consonant half-letters. The table has 0xbb entries, so codes above 0xba
   (such as the 0x100-0x1ff ligatures) have no entry here. */
unsigned char charflags[]={
NCV,					// 0x00: null character ends string, so must viramify
NCV,NCV,NCV,				// 0x01-0x03: candrabindu, anusvara, visarga
NCV,					// 0x04: unassigned
FSV,FSV,FSV,FSV,			// 0x05-0x08: A AA I II
FSV,FSV,FSV,FSV,			// 0x09-0x0c: U UU RI LI
FSV,FSV,FSV,FSV,			// 0x0d-0x10: ? ? E AI
FSV,FSV,FSV,FSV,			// 0x11-0x14: ? ? O AU
CV+FSV,CV+FSV,CV+FSV,CV+FSV,CV+FSV,	// 0x15-0x19: KA KHA GA GHA NGA
CV+FSV,CV+FSV,CV+FSV,CV+FSV,CV+FSV,	// 0x1a-0x1e: CA CHA JA JHA NYA
CV+FSV,CV+FSV,CV+FSV,CV+FSV,CV+FSV,	// 0x1f-0x23: TTA TTHA DDA DDHA NNA
CV+FSV,CV+FSV,CV+FSV,CV+FSV,CV+FSV,	// 0x24-0x28: TA THA DA DHA NA
CV+FSV,					// 0x29: NNNA
CV+FSV,CV+FSV,CV+FSV,CV+FSV,CV+FSV,	// 0x2a-0x2e: PA PHA BA BHA MA
CV+FSV,CV+FSV,CV+FSV,CV+FSV,CV+FSV,	// 0x2f-0x33: YA RA RRA LA LLA
CV+FSV,CV+FSV,				// 0x34-0x35: LLLA WA
CV+FSV,CV+FSV,CV+FSV,CV+FSV,		// 0x36-0x39: SHA SSA SA HA
NCV,NCV,NCV,NCV,			// 0x3a-0x3d: ? ? NUKTA AVAGRAHA
VCF,VCF,VCF,VCF,			// 0x3e-0x41: AA I II U (combining forms)
VCF,VCF,VCF,VCF,			// 0x42-0x45: UU RI RRI ?
VCF,VCF,VCF,VCF,			// 0x46-0x49: ? E AI ?
VCF,VCF,VCF,VCF,			// 0x4a-0x4d: ? O AU VIRAMA
NCV,NCV,NCV,				// 0x4e-0x50: unassigned unassigned OM
NCV,NCV,NCV,NCV,			// 0x51-0x54: accents
NCV,NCV,NCV,				// 0x55-0x57: unassigned
CV+FSV,CV+FSV,CV+FSV,CV+FSV,		// 0x58-0x5b: KA KHA GA JA with dot
CV+FSV,CV+FSV,CV+FSV,CV+FSV,		// 0x5c-0x5f: DDA DDHA PHA PA with dot
FSV,FSV,VCF,VCF,			// 0x60-0x63: RRI LLI LI LLI
NCV,NCV,NCV,NCV,			// 0x64-0x67: DANDA DVIDANDA 0 1
NCV,NCV,NCV,NCV,			// 0x68-0x6b: 2 3 4 5
NCV,NCV,NCV,NCV,			// 0x6c-0x6f: 6 7 8 9
NCV,NCV,NCV,NCV,			// 0x70-0x73: abbreviation 3*unassigned
NCV,NCV,NCV,NCV,			// 0x74-0x77: unassigned (in Bengali assigned to currency symbols)
NCV,NCV,NCV,NCV,			// 0x78-0x7b: unassigned
NCV,NCV,NCV,NCV,			// 0x7c-0x7f: unassigned. Here ends the Unicode page.
NCV,CV+FSV,CV+FSV,CV+FSV,		// 0x80-0x83: blank and special CV ligatures
CV+FSV,CV+FSV,CV+FSV,CV+FSV,		// 0x84-0x87
CV+FSV,CV+FSV,CV+FSV,CV+FSV,		// 0x88-0x8b
CV+FSV,CV+FSV,CV+FSV,CV+FSV,		// 0x8c-0x8f
NCV,NCV,NCV,NCV,NCV,			// 0x90-0x94
CHL,CHL,CHL,CHL,CHL+BAC,		// 0x95-0x99: K KH G GH NG
CHL,CHL+BAC,CHL,CHL,CHL,		// 0x9a-0x9e: C CH J JH NY
CHL+BAC,CHL+BAC,CHL+BAC,CHL+BAC,CHL,	// 0x9f-0xa3: TT TTH DD DDH NN
CHL,CHL,CHL+BAC,CHL,CHL,		// 0xa4-0xa8: T TH D DH N
CHL,					// 0xa9: NNN
CHL,CHL+BAC,CHL,CHL,CHL,		// 0xaa-0xae: P PH B BH M
CHL,CHL+BAC,CHL+BAC,CHL,CHL,		// 0xaf-0xb3: Y R RR L LL
CHL,CHL,				// 0xb4-0xb5: LLL W
CHL,CHL,CHL,CHL+BAC,			// 0xb6-0xb9: SH SS S H
0};					// 0xba: final entry, no flags

/* Rewrite rules, tried in order. Each rule is
       lhs..., separator, rhs..., 0
   where the separator is a code in the YIELDS range (YIELDS+n moves the cursor
   n positions after the substitution). A lone 0 ends the table.
   CHL0 on the lhs matches any consonant half-letter; CHL0R on the rhs emits
   that letter xored with 0x80, i.e. the corresponding full consonant. */
NAGCHAR rules[]={
CHL0,A,YIELDS,CHL0R,0,			// half-letter + A -> full consonant
CHL0,AA,YIELDS,CHL0R,VAA,0,		// half-letter + vowel -> full consonant + combining form
CHL0,I,YIELDS,CHL0R,VI,0,
CHL0,II,YIELDS,CHL0R,VII,0,
CHL0,U,YIELDS,CHL0R,VU,0,
CHL0,UU,YIELDS,CHL0R,VUU,0,
CHL0,RI,YIELDS,CHL0R,VR,0,
CHL0,RRI,YIELDS,CHL0R,VRR,0,
CHL0,E,YIELDS,CHL0R,VE,0,
CHL0,AI,YIELDS,CHL0R,VAI,0,
CHL0,O,YIELDS,CHL0R,VO,0,
CHL0,AU,YIELDS,CHL0R,VAU,0,
CHL0,NCV0,YIELDS,CHL0R,VIRAMA,NCV0,0,	// half-letter before a non-consonant-vowel (or end of string) gets a virama
ANY,YIELDS+1,ANY,0, // this rule MUST be last; it advances the pointer
0};

/* Registers filled in by wildcard matches: reg[0..31] hold the character a
   wildcard matched, reg[32..63] the same character xored with 0x80.
   `any` holds the character last matched by ANY. */
NAGCHAR reg[64],any;

/* Returns true if character ch matches pattern pat.
   pat may be ANY (matches everything), a flag wildcard in FSV0..FSV0R-1
   (matches when ch has the flag selected by the low three bits of pat),
   or a literal character code.
   If pat is a wildcard, the corresponding register is set.
   Characters with no entry in charflags (codes past the end of the table,
   e.g. the 0x100+ ligatures) carry no flags and so never match a flag wildcard. */
int match1(NAGCHAR ch,NAGCHAR pat)
{
    if (pat==ANY) {
        any=ch;
        return 1;
    }
    if (pat>=FSV0 && pat<FSV0R
        && ch<sizeof(charflags)/sizeof(charflags[0])	/* stay inside the table */
        && (charflags[ch]&(1<<(pat&7)))) {
        reg[pat-FSV0]=ch;
        reg[pat-FSV0+32]=ch^128; // the version with or without danda
        return 1;
    }
    return ch==pat;
}

/* True when lo <= x < hi. Note that x is evaluated twice. */
#define NAG_IN_RANGE(x,lo,hi) ((x)>=(lo) && (x)<(hi))
/* A rule separator: any code within 16 below / 15 above YIELDS. */
#define isyield(x) NAG_IN_RANGE(x,YIELDS-16,YIELDS+16)
/* A wildcard metacharacter: one of the 64 codes starting at FSV0. */
#define iswild(x) NAG_IN_RANGE(x,FSV0,FSV0+64)

/* Compares str against the lhs of rule (everything up to the separator).
   Returns the number of characters matched, or 0 if any position fails
   to match (or the lhs is empty). */
int match(NAGCHAR *str,NAGCHAR *rule)
{
    int pos = 0;

    while (!isyield(rule[pos])) {
        if (!match1(str[pos], rule[pos]))
            return 0;
        pos++;
    }
    return pos;
}

/* Substitutes rule (which is the rhs, terminated by 0) for the first len
   characters of str, in place. When the rhs length differs from len, the
   rest of the string, terminator included, is shifted to make or close the
   gap. Wildcards on the rhs are replaced by their register contents and ANY
   by the last character ANY matched.
   There is no bounds checking: str must have room for the result. */
void subst(NAGCHAR *str,NAGCHAR *rule,int len)
{
    int newlen = 0;

    while (rule[newlen])
        newlen++;

    if (len != newlen) {
        int total = 0;
        int tail;

        while (str[total])
            total++;
        /* Characters after the matched part, plus the terminator. This is 0
           when the match itself consumed the terminator; guard against a
           negative count, which would wrap to a huge size. */
        tail = total - len + 1;
        if (tail > 0)
            memmove(str + newlen, str + len, (size_t)tail * sizeof(NAGCHAR));
    }

    for (; *rule; rule++, str++) {
        if (iswild(*rule))
            *str = reg[*rule - FSV0];
        else if (*rule == ANY)
            *str = any;
        else
            *str = *rule;
    }
}

/* Tries one rule at the cursor *str.
   On a match, rewrites the string with the rule's rhs, moves *str by the
   offset encoded in the separator (separator minus YIELDS), and returns the
   matched length. Returns 0 and changes nothing if the rule does not apply. */
int apply(NAGCHAR **str,NAGCHAR *rule)
{
    NAGCHAR *sep = rule;
    int matched = match(*str, rule);

    if (!matched)
        return 0;

    /* locate the lhs/rhs separator */
    while (!isyield(*sep))
        sep++;

    subst(*str, sep + 1, matched);
    *str += *sep - YIELDS;
    return matched;
}

/* Returns a pointer to the element just past the 0 that terminates rule. */
NAGCHAR *next(NAGCHAR *rule)
{
    while (*rule)
        rule++;
    return rule + 1;
}

/* Walks the rules table in order and applies the first rule that matches at
   the cursor *str. Returns nonzero (the first element of the applied rule)
   if a rule was applied, 0 if the end of the table was reached. */
int tryallrules(NAGCHAR **str)
{
    NAGCHAR *candidate = rules;

    while (*candidate && !apply(str, candidate))
        candidate = next(candidate);
    return *candidate;
}

/* Turns a sequence of letters into nagari graphemes.
   There is NO bounds-checking. The string is converted in place
   and must be long enough to hold the result.
   Returns the offset from str at which conversion stopped: the position of
   the terminator, or of the first character no rule applied to. */
int nagarize(NAGCHAR *str)
{
    NAGCHAR *cursor = str;

    while (*cursor && tryallrules(&cursor))
        ;
    return (int)(cursor - str);
}

/* Prints every code of a 0-terminated NAGCHAR string as hex, 4 columns wide. */
static void print_codes(const NAGCHAR *s)
{
    while (*s)
        printf("%4x", *s++);
}

/* Demo: prints the sample sentence as letter codes, converts it in place,
   then prints the resulting grapheme codes on the next line. */
int main(int argc, char **argv)
{
    /* The train arrives in Limadi station. Trailing zeros leave room for growth. */
    NAGCHAR limadi[] = {
        L,II,M,A,D,II,BLANK,G,AA,M,E,BLANK,G,AA,D,II,BLANK,M,A,L,II,
        0,0,0,0,0,0,0,0,0
    };

    print_codes(limadi);
    nagarize(limadi);
    putchar('\n');
    print_codes(limadi);
    putchar('\n');
}

First working version of Nagari converter

Reply via email to