The attached program correctly converts Nagari letters to graphemes, as long as
there are no consonant clusters or short i's. The sample sentence is a Gujarati
palindrome.
phma
#include <stdio.h>
#include <string.h>
/* Conversion routines between Nagari (biased to Gujarati) letters and printed characters.
The characters 0000-007f are Unicode minus an offset that depends on
the particular Nagari script.
By Pierre Abbat. Preliminary version 2000-02-13 */
/* Storage unit for one letter code, grapheme code or rule metacharacter. */
typedef unsigned short NAGCHAR;
/* Letter codes below 0x80 are Unicode code points minus the script's page
   offset; BLANK and the codes after it are private additions. */
enum {
    /* Signs flagged NCV in charflags */
    CANDRABINDU = 0x01,
    ANUSVARA    = 0x02,
    VISARGA     = 0x03,

    /* Freestanding vowels */
    A  = 0x05,
    AA = 0x06,
    I  = 0x07,
    II = 0x08,
    U  = 0x09,
    UU = 0x0a,
    RI = 0x0b,
    LI = 0x0c,
    E  = 0x0f,
    AI = 0x10,
    O  = 0x13,
    AU = 0x14,

    /* Consonant-vowel letters (full consonant letters) */
    KA   = 0x15,
    KHA  = 0x16,
    GA   = 0x17,
    GHA  = 0x18,
    NGA  = 0x19,
    CA   = 0x1a,
    CHA  = 0x1b,
    JA   = 0x1c,
    JHA  = 0x1d,
    NYA  = 0x1e,
    TTA  = 0x1f,
    TTHA = 0x20,
    DDA  = 0x21,
    DDHA = 0x22,
    NNA  = 0x23,
    TA   = 0x24,
    THA  = 0x25,
    DA   = 0x26,
    DHA  = 0x27,
    NA   = 0x28,
    NNNA = 0x29,
    PA   = 0x2a,
    PHA  = 0x2b,
    BA   = 0x2c,
    BHA  = 0x2d,
    MA   = 0x2e,
    YA   = 0x2f,
    RA   = 0x30,
    LA   = 0x32,
    LLA  = 0x33,
    VA   = 0x35,
    SHA  = 0x36,
    SSA  = 0x37,
    SA   = 0x38,
    HA   = 0x39,

    /* Vowel combining forms and virama */
    VAA    = 0x3e,
    VI     = 0x3f,
    VII    = 0x40,
    VU     = 0x41,
    VUU    = 0x42,
    VR     = 0x43,
    VRR    = 0x44,
    VE     = 0x47,
    VAI    = 0x48,
    VO     = 0x4b,
    VAU    = 0x4c,
    VIRAMA = 0x4d,

    /* Freestanding RRI and the combining form VL */
    RRI = 0x60,
    VL  = 0x62,

    /* Blank and special consonant-vowel ligatures */
    BLANK = 0x80,
    JAA   = 0x81,
    JII   = 0x82,
    JO    = 0x83,
    JAU   = 0x84,
    RUU   = 0x85
};
/* Consonant half-letters.  Each code is the corresponding full letter with
   bit 0x80 set (KA 0x15 -> K 0x95), which is what lets the rule engine turn
   one form into the other with a single xor.  C and P therefore have to be
   0x9a and 0xaa, the positions of their CHL entries in charflags. */
enum {
    K   = 0x95,
    KH  = 0x96,
    G   = 0x97,
    GH  = 0x98,
    NG  = 0x99,
    C   = 0x9a,
    CH  = 0x9b,
    J   = 0x9c,
    JH  = 0x9d,
    NY  = 0x9e,
    TT  = 0x9f,
    TTH = 0xa0,
    DD  = 0xa1,
    DDH = 0xa2,
    NN  = 0xa3,
    T   = 0xa4,
    TH  = 0xa5,
    D   = 0xa6,
    DH  = 0xa7,
    N   = 0xa8,
    P   = 0xaa,
    PH  = 0xab,
    B   = 0xac,
    BH  = 0xad,
    M   = 0xae,
    Y   = 0xaf,
    R   = 0xb0,
    L   = 0xb2,
    LL  = 0xb3,
    V   = 0xb5,
    SH  = 0xb6,
    SS  = 0xb7,
    S   = 0xb8,
    H   = 0xb9
};
/* Assignment of code points to ligatures is based on the first letter:
100-11f velar 9
120-12f palatal 4
130-13f retroflex 4
140-15f dental 10
160-16b bilabial 6
16c-16f semivowel 2
170-17f fricative 8
<100 with vowels
The numbers 100-17f are for ligatures including a vowel (including Gujarati JI);
180-1ff are for ligatures with no vowel (some of which are unprintable).
The number 2 is used to denote doubling, as retroflex consonants are
denoted by symbols such as DD.
*/
/* Ligature codes.  0x100-0x17f are ligatures that include a vowel; the same
   ligature without a vowel is the code with bit 0x80 set (0x180-0x1ff). */
enum {
    /* velar, with vowel */
    K2A   = 0x100,
    KTA   = 0x101,
    KRA   = 0x102,
    KSSA  = 0x103,
    NGKA  = 0x104,
    NGKHA = 0x105,
    NGGA  = 0x106,
    NGGHA = 0x107,
    NGNGA = 0x108,
    NGMA  = 0x109,
    /* palatal, with vowel */
    CHRA = 0x120,
    JNYA = 0x121,
    /* retroflex, with vowel */
    TT2A  = 0x130,
    TTRA  = 0x131,
    TTHRA = 0x132,
    DDRA  = 0x133,
    DDHRA = 0x134,
    /* dental, with vowel */
    T2A  = 0x140,
    TRA  = 0x141,
    DGA  = 0x142,
    D2A  = 0x143,
    D2HA = 0x144,
    DBA  = 0x145,
    DRA  = 0x146,
    DVA  = 0x147,
    /* bilabial, with vowel */
    PHRA = 0x160,
    /* fricative, with vowel */
    SHVA   = 0x170,
    SSTTA  = 0x171,
    SSTTHA = 0x172,
    STRA   = 0x173,

    /* velar, no vowel */
    K2   = 0x180,
    KT   = 0x181,
    KR   = 0x182,
    KSS  = 0x183,
    NGK  = 0x184,
    NGKH = 0x185,
    NGG  = 0x186,
    NGGH = 0x187,
    NGNG = 0x188,
    NGM  = 0x189,
    /* palatal, no vowel */
    CHR = 0x1a0,
    JNY = 0x1a1,
    /* retroflex, no vowel */
    TT2  = 0x1b0,
    TTR  = 0x1b1,
    TTHR = 0x1b2,
    DDR  = 0x1b3,
    DDHR = 0x1b4,
    /* dental, no vowel */
    T2  = 0x1c0,
    TR  = 0x1c1,
    DG  = 0x1c2,
    D2  = 0x1c3,
    D2H = 0x1c4,
    DB  = 0x1c5,
    DR  = 0x1c6,
    DV  = 0x1c7,
    /* bilabial, no vowel */
    PHR = 0x1e0,
    /* fricative, no vowel */
    SHV   = 0x1f0,
    SSTT  = 0x1f1,
    SSTTH = 0x1f2,
    STR   = 0x1f3
};
/* Metacharacters:
ff00-ff07 match characters with flags 0-7 set respectively.
ff08-ff1f repeat previous. Character is stored in a register.
ff20-ff3f same as ff00-ff1f xored with 80, used on rhs of rules.
ff70-ff8f separate lhs from rhs. The cursor is moved n-ff80 positions.
*/
/* Rule metacharacters.  The low three bits of a wildcard select the flag bit
   it tests; the digit suffix selects which register records the match; the
   R suffix marks the register that holds the matched character xored with
   0x80. */
enum {
    FSV0 = 0xff00, CHL0 = 0xff01, BAC0 = 0xff02,
    VCF0 = 0xff03, NCV0 = 0xff04, CV0  = 0xff05,

    FSV1 = 0xff08, CHL1 = 0xff09, BAC1 = 0xff0a,
    VCF1 = 0xff0b, NCV1 = 0xff0c, CV1  = 0xff0d,

    FSV2 = 0xff10, CHL2 = 0xff11, BAC2 = 0xff12,
    VCF2 = 0xff13, NCV2 = 0xff14, CV2  = 0xff15,

    FSV3 = 0xff18, CHL3 = 0xff19, BAC3 = 0xff1a,
    VCF3 = 0xff1b, NCV3 = 0xff1c, CV3  = 0xff1d,

    FSV0R = 0xff20, CHL0R = 0xff21, BAC0R = 0xff22,
    VCF0R = 0xff23, NCV0R = 0xff24, CV0R  = 0xff25,

    FSV1R = 0xff28, CHL1R = 0xff29, BAC1R = 0xff2a,
    VCF1R = 0xff2b, NCV1R = 0xff2c, CV1R  = 0xff2d,

    FSV2R = 0xff30, CHL2R = 0xff31, BAC2R = 0xff32,
    VCF2R = 0xff33, NCV2R = 0xff34, CV2R  = 0xff35,

    FSV3R = 0xff38, CHL3R = 0xff39, BAC3R = 0xff3a,
    VCF3R = 0xff3b, NCV3R = 0xff3c, CV3R  = 0xff3d,

    YIELDS = 0xff80,  /* lhs/rhs separator; YIELDS+n also moves the cursor by n */
    ANY    = 0xffff   /* matches any character */
};
/* Meanings of the character flags:
01 freestanding vowel or consonant-vowel or cluster-vowel combination.
02 consonant half-letter.
04 bare adandic consonant, which must be viramified.
08 vowel combining form or virama.
10 non-consonant-vowel, before which a bare consonant (even if dandic) must be viramified.
20 consonant-vowel
*/
/* Character flag bits stored in charflags[]. */
enum {
    FSV = 1 << 0,  /* freestanding vowel, consonant-vowel or cluster-vowel */
    CHL = 1 << 1,  /* consonant half-letter */
    BAC = 1 << 2,  /* bare adandic consonant, which must be viramified */
    VCF = 1 << 3,  /* vowel combining form or virama */
    NCV = 1 << 4,  /* non-consonant-vowel */
    CV  = 1 << 5   /* consonant-vowel */
};
/* Flag bits for every character code from 0x00 to 0xba, indexed by code.
   The hexadecimal range in each comment is the index range of that row. */
unsigned char charflags[]={
    NCV,                                          // 00: null character ends string, so must viramify
    NCV,NCV,NCV,                                  // 01-03: candrabindu, anusvara, visarga
    NCV,                                          // 04: unassigned
    FSV,FSV,FSV,FSV,                              // 05-08: A AA I II
    FSV,FSV,FSV,FSV,                              // 09-0c: U UU RI LI
    FSV,FSV,FSV,FSV,                              // 0d-10: ? ? E AI
    FSV,FSV,FSV,FSV,                              // 11-14: O ? ? AU
    CV|FSV,CV|FSV,CV|FSV,CV|FSV,CV|FSV,           // 15-19: KA KHA GA GHA NGA
    CV|FSV,CV|FSV,CV|FSV,CV|FSV,CV|FSV,           // 1a-1e: CA CHA JA JHA NYA
    CV|FSV,CV|FSV,CV|FSV,CV|FSV,CV|FSV,           // 1f-23: TTA TTHA DDA DDHA NNA
    CV|FSV,CV|FSV,CV|FSV,CV|FSV,CV|FSV,           // 24-28: TA THA DA DHA NA
    CV|FSV,                                       // 29: NNNA
    CV|FSV,CV|FSV,CV|FSV,CV|FSV,CV|FSV,           // 2a-2e: PA PHA BA BHA MA
    CV|FSV,CV|FSV,CV|FSV,CV|FSV,CV|FSV,           // 2f-33: YA RA RRA LA LLA
    CV|FSV,CV|FSV,                                // 34-35: LLLA WA
    CV|FSV,CV|FSV,CV|FSV,CV|FSV,                  // 36-39: SHA SSA SA HA
    NCV,NCV,NCV,NCV,                              // 3a-3d: ? ? NUKTA AVAGRAHA
    VCF,VCF,VCF,VCF,                              // 3e-41: AA I II U
    VCF,VCF,VCF,VCF,                              // 42-45: UU RI RRI ?
    VCF,VCF,VCF,VCF,                              // 46-49: ? E AI ?
    VCF,VCF,VCF,VCF,                              // 4a-4d: ? O AU VIRAMA
    NCV,NCV,NCV,                                  // 4e-50: unassigned unassigned OM
    NCV,NCV,NCV,NCV,                              // 51-54: accents
    NCV,NCV,NCV,                                  // 55-57: unassigned
    CV|FSV,CV|FSV,CV|FSV,CV|FSV,                  // 58-5b: KA KHA GA JA with dot
    CV|FSV,CV|FSV,CV|FSV,CV|FSV,                  // 5c-5f: DDA DDHA PHA PA with dot
    FSV,FSV,VCF,VCF,                              // 60-63: RRI LLI LI LLI
    NCV,NCV,NCV,NCV,                              // 64-67: DANDA DVIDANDA 0 1
    NCV,NCV,NCV,NCV,                              // 68-6b: 2 3 4 5
    NCV,NCV,NCV,NCV,                              // 6c-6f: 6 7 8 9
    NCV,NCV,NCV,NCV,                              // 70-73: abbreviation 3*unassigned
    NCV,NCV,NCV,NCV,                              // 74-77: unassigned (in Bengali assigned to currency symbols)
    NCV,NCV,NCV,NCV,                              // 78-7b: unassigned
    NCV,NCV,NCV,NCV,                              // 7c-7f: unassigned. Here ends the Unicode page.
    NCV,CV|FSV,CV|FSV,CV|FSV,                     // 80-83: blank and special CV ligatures
    CV|FSV,CV|FSV,CV|FSV,CV|FSV,                  // 84-87
    CV|FSV,CV|FSV,CV|FSV,CV|FSV,                  // 88-8b
    CV|FSV,CV|FSV,CV|FSV,CV|FSV,                  // 8c-8f
    NCV,NCV,NCV,NCV,NCV,                          // 90-94
    CHL,CHL,CHL,CHL,CHL|BAC,                      // 95-99: K KH G GH NG
    CHL,CHL|BAC,CHL,CHL,CHL,                      // 9a-9e: C CH J JH NY
    CHL|BAC,CHL|BAC,CHL|BAC,CHL|BAC,CHL,          // 9f-a3: TT TTH DD DDH NN
    CHL,CHL,CHL|BAC,CHL,CHL,                      // a4-a8: T TH D DH N
    CHL,                                          // a9: NNN
    CHL,CHL|BAC,CHL,CHL,CHL,                      // aa-ae: P PH B BH M
    CHL,CHL|BAC,CHL|BAC,CHL,CHL,                  // af-b3: Y R RR L LL
    CHL,CHL,                                      // b4-b5: LLL W
    CHL,CHL,CHL,CHL|BAC,                          // b6-b9: SH SS S H
    0                                             // ba
};
/* Rewrite rules, tried in order at the current cursor position.
   Layout of one rule:  lhs patterns, YIELDS+n, rhs replacement, 0.
   After a rule fires, the cursor is moved by n characters.  On the rhs,
   CHL0R stands for the character that CHL0 matched, xored with 0x80 (the
   full letter for a half-letter), and a repeated wildcard such as NCV0
   stands for the matched character itself.  A lone 0 ends the table. */
NAGCHAR rules[]={
/* half-letter + A: just the full letter */
CHL0,A,YIELDS,CHL0R,0,
/* half-letter + freestanding vowel: full letter + vowel combining form */
CHL0,AA,YIELDS,CHL0R,VAA,0,
CHL0,I,YIELDS,CHL0R,VI,0,
CHL0,II,YIELDS,CHL0R,VII,0,
CHL0,U,YIELDS,CHL0R,VU,0,
CHL0,UU,YIELDS,CHL0R,VUU,0,
CHL0,RI,YIELDS,CHL0R,VR,0,
CHL0,RRI,YIELDS,CHL0R,VRR,0,
CHL0,E,YIELDS,CHL0R,VE,0,
CHL0,AI,YIELDS,CHL0R,VAI,0,
CHL0,O,YIELDS,CHL0R,VO,0,
CHL0,AU,YIELDS,CHL0R,VAU,0,
/* half-letter before an NCV character (the terminating 0 included):
   full letter + virama, followed by that character unchanged */
CHL0,NCV0,YIELDS,CHL0R,VIRAMA,NCV0,0,
ANY,YIELDS+1,ANY,0, // this rule MUST be last; it advances the pointer
0};
/* Registers filled by match1() and read back by subst(): entries 0-31 hold
   the characters matched by wildcards, entries 32-63 the same characters
   xored with 0x80. */
NAGCHAR reg[64];
/* The character most recently matched by the ANY pattern. */
NAGCHAR any;
/* Returns true if character ch matches pattern pat.
   pat is one of:
     ANY            - always matches; ch is saved in `any`.
     FSV0..FSV0R-1  - flag wildcard; matches when ch has flag bit (pat&7)
                      set in charflags.  ch is saved in reg[pat-FSV0] and
                      ch^0x80 in the register 32 places further on.
     anything else  - matches only the identical character.
   Character codes beyond the end of charflags (the ligature range, for
   instance) are treated as having no flags, so they never match a flag
   wildcard and the table is never read out of bounds. */
int match1(NAGCHAR ch,NAGCHAR pat)
{
    unsigned char flags;

    if (pat == ANY) {
        any = ch;
        return 1;
    }
    if (pat >= FSV0 && pat < FSV0R) {
        flags = ch < sizeof charflags / sizeof charflags[0] ? charflags[ch] : 0;
        if (flags & (1 << (pat & 7))) {
            reg[pat - FSV0] = ch;
            reg[pat - FSV0 + 32] = ch ^ 128; // the version with or without danda
            return 1;
        }
    }
    return ch == pat;
}
/* True if x is a lhs/rhs separator, i.e. lies in YIELDS-16 .. YIELDS+15. */
#define isyield(x) (YIELDS-16<=(x) && (x)<YIELDS+16)
/* True if x is a wildcard or register reference, i.e. lies in FSV0 .. FSV0+63. */
#define iswild(x) (FSV0<=(x) && (x)<FSV0+64)
/* Compares str against the lhs of rule (everything before the separator).
   Returns the number of characters matched, or 0 if the lhs does not match
   (an empty lhs also yields 0).  Registers are filled as a side effect. */
int match(NAGCHAR *str,NAGCHAR *rule)
{
    int matched = 0;

    while (!isyield(*rule)) {
        if (!match1(*str, *rule))
            return 0;
        rule++;
        str++;
        matched++;
    }
    return matched;
}
/* Substitutes rule (the rhs of a rule, terminated by 0) for the first len
   characters of str.  The rest of the string, terminator included, is
   shifted when the replacement differs in length; the caller must provide
   enough room.  Wildcards in the rhs are replaced by the register contents,
   ANY by the character last matched by ANY. */
void subst(NAGCHAR *str,NAGCHAR *rule,int len)
{
    int newlen, oldlen, tail;

    for (newlen = 0; rule[newlen]; newlen++)
        ;
    if (len != newlen) {
        for (oldlen = 0; str[oldlen]; oldlen++)
            ;
        /* Characters after the match, plus the terminator.  This is zero
           when the match consumed the terminator itself; never pass a
           negative count on to memmove. */
        tail = oldlen - len + 1;
        if (tail > 0)
            memmove(str + newlen, str + len, (size_t)tail * sizeof(NAGCHAR));
    }
    for (; *rule; rule++, str++) {
        if (iswild(*rule))
            *str = reg[*rule - FSV0];
        else if (*rule == ANY)
            *str = any;
        else
            *str = *rule;
    }
}
/* Tries one rule at the cursor *str.
   If the lhs matches, the rhs is substituted in place, the cursor is moved
   by the offset encoded in the separator, and the match length (non-zero)
   is returned.  Otherwise nothing is changed and 0 is returned. */
int apply(NAGCHAR **str,NAGCHAR *rule)
{
    NAGCHAR *sep;
    int matched = match(*str, rule);

    if (matched == 0)
        return 0;
    sep = rule;
    while (!isyield(*sep))
        sep++;
    subst(*str, sep + 1, matched);
    *str += *sep - YIELDS;
    return matched;
}
/* Returns a pointer to the element just past the 0 that terminates rule. */
NAGCHAR *next(NAGCHAR *rule)
{
    while (*rule != 0)
        rule++;
    return rule + 1;
}
/* Applies the first rule in `rules` that matches at the cursor *str.
   Returns the first element of the rule that fired (non-zero), or 0 if the
   end of the table was reached without any rule applying. */
int tryallrules(NAGCHAR **str)
{
    NAGCHAR *rule = rules;

    while (*rule && !apply(str, rule))
        rule = next(rule);
    return *rule;
}
/* Turns a sequence of letters into nagari graphemes.
   There is NO bounds-checking. The string is converted in place
   and must be long enough to hold the result.
   Always returns 0; the int return type is kept for existing callers. */
int nagarize(NAGCHAR *str)
{
    /* Keep rewriting until the cursor reaches the terminator or no rule applies. */
    while (*str && tryallrules(&str))
        ;
    return 0;
}
/* Demo: prints the sample sentence as hex letter codes, converts it in
   place, and prints the resulting grapheme codes on a second line. */
int main(int argc, char **argv)
{
    /* The train arrives in Limadi station.
       The trailing zeros leave room for the string to grow. */
    NAGCHAR limadi[] = {
        L, II, M, A, D, II, BLANK,
        G, AA, M, E, BLANK,
        G, AA, D, II, BLANK,
        M, A, L, II,
        0, 0, 0, 0, 0, 0, 0, 0, 0
    };
    NAGCHAR *p;

    for (p = limadi; *p; p++)
        printf("%4x", *p);
    nagarize(limadi);
    putchar('\n');
    for (p = limadi; *p; p++)
        printf("%4x", *p);
    putchar('\n');
    return 0;
}