There are still many rough edges to refine, but here is the first working
version.
First you need to create the unitab.ijf file.
=======================================
NB. Parse EastAsianWidth.txt into code point / width code tables.
require 'regex jfiles'
NB. t: the whole data file as one character string
t=: 1!:1 <'temp\box\EastAsianWidth.txt'
NB. point: matches of "XXXX;code" (exactly four hex digits, then a width code)
point=:'^([0-9A-F]{4});(Na|N|H|A|W|F)' rxmatches t
NB. range: matches of "first..last;code" with one to four hex digits per end
range=:'([0-9A-F]{1,4})\.\.([0-9A-F]{1,4});(Na|N|H|A|W|F)' rxmatches t
NB. pod, rad: matched text per entry with the first column dropped
NB. (presumably the whole-match column, leaving only the subgroups - confirm)
pod=: }."1 point rxfrom t
rad=: }."1 range rxfrom t
NB. widthcode: the six width classes; a class's position is its numeric code
widthcode=:;: 'N Na H A F W'
towc=: widthcode&i. NB. towidthcode
NB. dfh: string of upper-case hex digits to integer
dfh=: 16&#. @ ('0123456789ABCDEF'&i.)
NB. po: one row per single code point: boxed code point , boxed width code
po=:(dfh each {."1 pod),. <"0 towc"0 {:"1 pod
NB. ra: one row per range: boxed (first,last) pair , boxed width code
ra=:(,&.>/"1 dfh each 2&{."1 rad),. <"0 towc"0 {:"1 rad
NB. poa: the single code points opened into a plain list
poa=:>{."1 po
NB. x fill y: x is a boxed pair, (first,last) ; code.
NB. Returns y with every position from first through last set to code.
fill=: 4 : 0
'span code'=. x
lo=. {. span
idx=. lo + i. >: -~/ span
({. code) idx} y
)
NB. tab: one width code per position, 65536 positions, all starting at 0
tab=:65536$0 NB. missing is N
NB. store the codes of the individually listed code points
tab=:(> {:"1 po) poa} tab
NB. apply each range row to its own copy of tab, then take the item-wise maximum
tab=:>./ ra fill"1 tab
NB. save tab by appending it to a newly created component file
jcreate 'temp\box\unitab'
(<tab) jappend 'temp\box\unitab'
==================================
Then you can use unitab from now on.
Save the following code in diswid.ijs
==================================
require 'jfiles'
NB. tab: width-code table read from component 0 of the unitab file
tab=:>jread 'temp\box\unitab';0
NB. diswidr: per-character display width: 2 where the table entry is 4 or
NB. more, otherwise 1 (the table is indexed by 3&u: of the ucp-converted text)
diswidr=: [: >: [: 4&<: [: {&tab 3&u:@:ucp"1 NB.for rank 1
NB. diswid: as diswidr, except that an argument matching '' yields 0
diswid=: diswidr`0:@.(''&-:)
===================================
Finally, we can have a verb that renders a boxed array in Unicode with
visually correct alignment. I've interspersed test code (assert lines); you
may delete them. I double-checked that all the tests run OK on my machine.
(Used DejaVu Sans Mono on Windows XP SP2, Korean version.)
Note: Box-drawing characters have A (Ambiguous) width in Unicode, and I treat
them as "narrow" for simplicity.
====================================
NB. Based on Roger's work at http://www.jsoftware.com/jwiki/Essays/Boxed_Array_Display
require 'strings'
load 'temp\box\diswid.ijs'
NB. unibox y: replace the characters 16..26 of a. found in y by the line-drawing
NB. code points listed in to.  y is returned unchanged when it holds none of them;
NB. otherwise the result is passed through utf8.
unibox=: 3 : 0 NB. by Chris Burke
NB. fm: the 11 characters at positions 16..26 of the alphabet a.
fm=. (16+i.11) { a.
msk=. y e. fm
NB. nothing to translate: hand y back untouched
if. -. 1 e. msk do. y return. end.
NB. to: replacement characters, position for position with fm
to=. 4 u: 9484 9516 9488 9500 9532 9508 9492 9524 9496 9474 9472
y=. ucp y
NB. recompute the mask on the converted y (presumably because ucp can
NB. change the number of items - confirm)
msk=. y e. fm
NB. un: the replacement for each masked character
un=. to {~ fm i. msk#y
utf8 un (I.msk) } y
)
NB. boxed: 1 when the datatype code of y (3!:0) is 32
boxed =: 32 = 3!:0
NB. mt: 1 when the shape of y contains a 0
mt =: 0 e. $
NB. boxc: the result of 9!:6 '' (the session's box-drawing characters)
boxc =: 9!:6 ''
NB. ub y: box-drawing characters of y converted for comparison with ucp'd text.
NB. NOTE(review): the archived line read "ub=:[EMAIL PROTECTED]" - an e-mail
NB. address filter ate a "name@name" composition.  Reconstructed as ucp applied
NB. to the result of unibox, which yields the form the asserts compare against
NB. (ucp each '...').  Confirm against the author's original if available.
ub=:ucp@unibox
NB. boxcu: the box-drawing characters after conversion by ub
boxcu=:ub boxc
NB. Two-character pieces picked from boxcu for building the outer border.
NB. NOTE(review): if boxc is in the same order as the replacement list in
NB. unibox, items 0..10 are the glyphs for code points 9484 9516 9488 9500
NB. 9532 9508 9492 9524 9496 9474 9472 - confirm.
tcorn =: 2 0{boxcu
tint =: 1 10{boxcu
bcorn =: 8 6{boxcu
bint =: 7 10{boxcu
NB. todisplay: join a list of boxed lines into one string with LF between lines
todisplay=:[:>(([ ,LF , ])&.>)/
NB. displaywidth: largest row total of diswid over the contents of all boxes of y
displaywidth=:[:>./ ([:>./ +/"1@:diswid) &>
NB. toplines y: y is a list of boxed lines.  Prepends one boxed line made of
NB. the last character of boxc (converted by ub), repeated displaywidth y times.
toplines=: 3 : 0
(< (displaywidth y) $ ub _1{boxc),y
)
te=: ucp each '──';'한'
assert te-:toplines <ucp '한'
te=: ucp each '───';'한';'abc'
assert te-:toplines ucp each '한';'abc'
NB. leftlines y: put the second-to-last character of boxcu in front of every
NB. boxed line of y.
leftlines=: 3 : 0
,&.>/"1 (<_2{ boxcu),. y
)
le=: ucp each '│한';'│abc';'│고'
assert le-:leftlines ucp each '한';'abc';'고'
NB. topleftpoint y: y is a list of boxed lines.  Replaces the first character
NB. of the first line with item 4 of boxcu and returns the list of boxed lines.
NB. (An earlier one-line definition built on boxc stood directly above this one
NB. and was overwritten before any use; it has been dropped.)
topleftpoint=: 3 : 0
p=. {.4 { boxcu
f=.>{.y
(<p 0}f),}.y
)
tl=: ucp each '┼──';'│bc';'│고'
assert tl-:topleftpoint ucp each '───';'│bc';'│고'
NB. topleft: apply toplines, then leftlines, then topleftpoint to a list of boxed lines
topleft=: topleftpoint @ leftlines @ toplines
e=: <@dtb;._2 ucp 0 : 0
┼───
│한
│abc
)
assert e-:topleft ucp each '한';'abc'
NB. x take y: y is a list of boxed lines; the result has r boxed lines, each
NB. taken to a length that brings its diswid total up to c.
take=: 4 : 0 NB. x is display size (height,dwidth)
'r c'=.x
NB. take r lines from y
l=. r{.y
NB. new length of a line = its item count + (c - its diswid total);
NB. a line for which mt holds is first replaced by ' '
((#+(c- [:+/ diswid)) {. ' '"_^:mt) each l
)
e=: <@('. '&charsub)@dtb;._2 ucp 0 : 0
한.
...
...
)
assert 2 3 3 -: ,$&> e
assert e-:(3 3) take <ucp '한'
e=: <@('. '&charsub)@dtb;._2 ucp 0 : 0
한a..
b....
한한.
.....
)
assert 4 5 3 5 -: ,$&> e
assert e-:(4 5) take ucp each '한a';'b';'한한'
b1=: <@dtb;._2 ucp 0 : 0
de
ab
)
b2=: <@dtb;._2 ucp 0 : 0
def
abc
g한
jkl
)
b3=: <@dtb;._2 ucp 0 : 0
mno
)
ball=:b1;<b2
NB. NOTE(review): this definition of inside is replaced by a second definition
NB. further down, before inside is used in any assert.
inside =: 1 1&}. @: ; @: (,.&.>/"1) @: (topleft&.>)
NB. todisplay ,&.>/"1 |:> topleft each (<4 3) (take&.>) ball
e=: <@('. '&charsub)@dtb;._2 ucp 0 : 0
de.│def.
ab.│abc.
...│g한.
...│jkl.
───┼────
def│mno.
abc│....
g한│....
jkl│....
...│....
)
ball2=:ball,: b2;<b3
NB. inside y: y is a table of boxes, each holding a list of boxed lines.
NB. Applies topleft to every cell, joins the cells of each table row line by
NB. line, razes the rows into one list of lines, then drops the first line and
NB. the first character of every remaining line.
inside=:[: 1&}. each [: 1&}. [: ; (,&.>&.>/"1) @: (topleft each)
assert e-: inside ({4 5;3 4) (take&.>) ball2
NB. x edge y: x is a list of cell sizes, y is (junction char , line char).
NB. Each cell contributes one junction followed by its size in line
NB. characters; the leading junction is then dropped.
NB. (The archived text had "}.@#" and "#@[" replaced by an e-mail address
NB. filter; restored from the essay this code is based on.)
edge =: ,@(1&,.)@[ }.@# +:@#@[ $ ]
NB. x right y: append to each boxed line of y the matching character of the
NB. edge built from the first box of x and items 5 9 of boxcu
right =: edge&(5 9{boxcu)@>@(0&{)@[ ,~&.>"1 ]
e=: <@dtb;._2 ucp 0 : 0
aaaaa│
aaaaa┤
aaaaa│
aaaaa│
)
assert e-:(1 2;2 2) right 4$<ucp 'aaaaa'
NB. x left y: prepend to each boxed line of y the matching character of the
NB. edge built from the first box of x and items 3 9 of boxcu
left =: (edge )&(3 9{boxcu)@>@(0&{)@[ ,&.>"1 ]
e=: <@dtb;._2 ucp 0 : 0
│aaaaa
├aaaaa
│aaaaa
│aaaaa
)
assert e-:(1 2;2 2) left 4$<ucp 'aaaaa'
NB. x bot y: append one boxed line after the lines of y.  The line is the edge
NB. built from the second box of x and bint, with bcorn prepended and the whole
NB. rotated left by one so the first corner character ends up last.
bot =: ([: < 1&|.@(bcorn&,)@(edge&bint)@>@(1&{)@[) ,"1~ ]
e=: <@dtb;._2 ucp 0 : 0
aaaaa
aaaaa
aaaaa
aaaaa
└──┴──┘
)
assert e-:(1 2;2 2) bot 4$<ucp ' aaaaa'
NB. x top y: put one boxed line before the lines of y.  The line is the edge
NB. built from the second box of x and tint, with tcorn prepended and the whole
NB. rotated left by one so the first corner character ends up last.
top =: ([:< 1&|.@(tcorn&,)@(edge&tint)@>@(1&{)@[) ,"1 ]
e=: <@dtb;._2 ucp 0 : 0
┌──┬──┐
aaaaa
aaaaa
aaaaa
aaaaa
)
assert e-:(1 2;2 2) top 4$<ucp ' aaaaa'
NB. x perim y: x top x bot x left x right y - adds the four outer border sides
perim =: [ top [ bot [ left right
e=: <@dtb;._2 ucp 0 : 0
┌──┬──┐
│aaaaa│
├aaaaa┤
│aaaaa│
│aaaaa│
└──┴──┘
)
assert e-:(1 2;2 2) perim 4$<ucp 'aaaaa'
NB. x frame y: x is (row heights ; column widths), y is a list or table of
NB. boxes of boxed lines.  A list y is itemized into a one-row table, every
NB. cell is resized by take with the matching entry of the catalogue { x,
NB. inside draws the interior lines, and perim adds the outer border.
NB. (The archived text had "{@[" and "#@$" replaced by an e-mail address filter
NB. and the line wrapped in two; restored from the essay this code is based on,
NB. consistent with the inside assert that uses ({4 5;3 4) (take&.>) ball2.)
frame=:[ perim {@[ inside@:(take&.>)"2 ,:^:(1 = #@$)@]
e=: <@dtb;._2 ucp 0 : 0
┌───┬────┐
│de │def │
│ab │abc │
│ │g한 │
│ │jkl │
├───┼────┤
│def│mno │
│abc│ │
│g한│ │
│jkl│ │
│ │ │
└───┴────┘
)
assert e-:(4 5;3 4) frame ball2
NB. sh y: reshape y to two axes: (product of all but the last axis , last axis);
NB. a 1 is prepended to the shape first
sh =: (*/@}: , {:)@(1&,)@$ $ ,
NB. matsize y: y is a list of boxed lines.  Returns (number of lines , largest
NB. diswid total of any line).
NB. (An earlier one-line definition, $&>, stood directly above this one and was
NB. overwritten before any use; it has been dropped.)
matsize=: 3 : 0
r=. #y
c=. >./ ([: +/ diswid)&> y
r,c
)
assert 3 4-:matsize ucp each '한글';'abc';'a'
NB. rc y: apply matsize to every cell of y and return two boxes: the maxima
NB. taken from the first matsize component and the maxima taken from the second
NB. (the assert below expects 4 4;3 3 3 for a 2 by 3 table of cells)
rc =: (>./@sh&.>) @: (,.@|:"2@:(0&{"1);1&{"1) @: (matsize&>)
ball3=: ball2 ,. ball
assert (4 4;3 3 3)-:rc ball3
NB. bl y: y is the result of rows.  For each of the ({. y,1) lines, counts how
NB. many entries of y divide the line's index; the counts are shifted left by
NB. one with a 0 appended, giving the number of blank lines after each line.
NB. (The archived text had "i.@{." replaced by an e-mail address filter;
NB. restored from the essay this code is based on.)
bl =: }.@(,&0)@(+/)@(0&=)@(|/ i.@{.@(,&1))
NB. mask y: y lists the number of blank lines wanted after each line.  Returns
NB. indices in which line k appears as k+1 and each blank appears as 0.
NB. (The archived text had "i.@#" replaced by an e-mail address filter;
NB. restored from the essay this code is based on.)
mask =: 1&,. #&, ,.&0@>:@i.@#
NB. shline y: ravel y into a list whose length is the product of (1 , shape of y)
shline=:*/@(1&,)@$ $ , NB. into 2-dimension (connected)
assert (4$<'aa')-: shline <"1] 2 2 2 $ 'a'
NB. rows y: suffix products of the shape of y.
NB. (The archived text had "\.@$" replaced by an e-mail address filter; this
NB. reconstruction yields the 12 6 3 the following assert expects for shape 2 2 3.)
rows =: */\.@$
assert 12 6 3-: rows <"1]2 2 3 3$'a'
NB. mat y: y is an array of boxed lines.  Returns the lines as one list, with
NB. empty boxes (a:) inserted where mask@bl@rows selects index 0.
NB. (The archived text had "mask@bl" replaced by an e-mail address filter;
NB. restored from the essay this code is based on.)
mat =: mask@bl@rows { a: , shline
e=: <@dtb;._2 ucp 0 : 0
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
)
assert e-:mat <"1]2 2 3 3$'a'
NB. thorn1 y: when y is boxed, thbox y; otherwise format y with ": and box the
NB. ucp conversion of each row
thorn1 =: (<@ucp"1@":)`thbox @. boxed
NB. thbox y: y is boxed.  Every box is replaced by mat of thorn1 of its
NB. contents; the result is framed using the sizes computed by rc.
NB. (The archived text had "mat@thorn1" replaced by an e-mail address filter;
NB. restored from the essay this code is based on.)
thbox =: (rc frame ]) @: (mat@thorn1&.>)
NB. fulltest: 2 by 3 boxed test array (rejoined; mail wrapping had split the
NB. sentence after the last ";", which is not valid on its own line)
fulltest=: 2 3 $ (i.2 3) ; 'ab한c' ; (i.4 1) ; (<2 2$'ussr') ; 12 ; <+&.>i.2 2 3
e=: <@dtb;._2 ucp 0 : 0
┌─────┬─────┬─────────┐
│0 1 2│ab한c│0 │
│3 4 5│ │1 │
│ │ │2 │
│ │ │3 │
├─────┼─────┼─────────┤
│┌──┐ │12 │┌─┬──┬──┐│
││us│ │ ││0│1 │2 ││
││sr│ │ │├─┼──┼──┤│
│└──┘ │ ││3│4 │5 ││
│ │ │└─┴──┴──┘│
│ │ │ │
│ │ │┌─┬──┬──┐│
│ │ ││6│7 │8 ││
│ │ │├─┼──┼──┤│
│ │ ││9│10│11││
│ │ │└─┴──┴──┘│
└─────┴─────┴─────────┘
)
assert e-:thbox fulltest
==============================
2007/2/13, June Kim <[EMAIL PROTECTED]>:
> I'm working on the code.
>
> In the mean time, here is the code for calculating display width:
>
> First you need to save the text file at
> http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
>
> ===============================================
> require 'regex jfiles'
> t=: 1!:1 <'EastAsianWidth.txt'
> point=:'^([0-9A-F]{4});(Na|N|H|A|W|F)' rxmatches t
> range=:'([0-9A-F]{1,4})\.\.([0-9A-F]{1,4});(Na|N|H|A|W|F)' rxmatches t
> jcreate 'unidatapoint'
> (< }."1 point rxfrom t) jappend 'unidatapoint'
> jcreate 'unidatarange'
> (< }."1 range rxfrom t) jappend 'unidatarange'
> ===============================================
>
> Now you have unidatapoint.ijf and unidatarange.ijf and are able to
use them.
>
> ===============================================
> require 'jfiles'
>
> NB. N : half
> NB. Na : half
> NB. H : half
> NB. A : half
> NB. F : full
> NB. W : full
>
> widthcode=:;: 'N Na H A F W'
> pod=:>jread 'unidatapoint';0
> rad=:>jread 'unidatarange';0
>
> towc=: widthcode&i. NB. towidthcode
>
> dfh=. 16&#. @ ('0123456789ABCDEF'&i.)
> po=:(dfh each {."1 pod),. <"0 towc"0 {:"1 pod
> ra=:(,&.>/"1 dfh each 2&{."1 rad),. <"0 towc"0 {:"1 rad
> poa=:>{."1 po
>
> fill=: 4 : 0
> 'r c'=.x
> r=. ({.r)+ i. >: -~/ r
> ({.c) r}y
> )
>
> tab=:65536$0 NB. missing is N
> tab=:(> {:"1 po) poa} tab
> tab=:>./ ra fill"1 tab
>
> diswid=: [: >: [: 4&<: [: {&tab 3&u:@ucp NB.for rank 1
> ================================================
> To improve performance, you could save tab with jfiles and reuse
> it. Also, you could use a more compact representation (using 3 bits to
> represent each character, and compressing the data).
>
> Usage Example:
> diswid '한글ab!─'
> 2 2 1 1 1 1
> (,:~ ((ucp'-') $~ +/@diswid)) ucp '한글ab!-' NB. properly showing
> the top line in fixed-pitch font
> --------
> 한글ab!-
>
>
>
> 2007/2/13, Eric Iverson <[EMAIL PROTECTED]>:
> > The problem of proper display of boxed unicode data is an interesting
> > one. The first step to getting this fixed is for someone to provide a
> > working J model that takes an arbitrary boxed argument and
produces the
> > character stream that properly displays it. If we had such a model we
> > might consider incorporating it into the JE.
> >
> > ----- Original Message -----
> > From: "June Kim" <[EMAIL PROTECTED]>
> > To: "General forum" <[email protected]>
> > Sent: Sunday, February 11, 2007 5:11 AM
> > Subject: Re: [Jgeneral] wd 'set ...' with box draw characters
> >
> >
> > > 2007/2/11, Chris Burke <[EMAIL PROTECTED]>:
> > >> June Kim wrote:
> > > [snip]
> > >> > Second, the box is broken with different width
characters(that is,
> > >> > when the length of bytes of the encoding, and the width of the
> > >> > characters on display don't match). What is the usual way of
> > >> > solving
> > >> > it in other programming languages? There is a unicode
standard for
> > >> > character widths. http://unicode.org/reports/tr11/
> > >> >
> > >> > Python implements that standard(along with others) in
unicodedata
> > >> > module.
> > >> >
> > >> >>>> unicodedata.east_asian_width(u'한')
> > >> > 'W'
> > >> >>>> unicodedata.east_asian_width(u'a')
> > >> > 'Na'
> > >> >
> > >> > (u specifies the following string is unicode. east_asian_width
> > >> > returns
> > >> > the width of the character, not only for east asian
characters but
> > >> > all
> > >> > unicode characters; it's got a narrow name due to its history)
> > >> >
> > > [snip]
> > >>
> > >> If you are having problems with display, it is because of the
font,
> > >> not
> > >> because we are not using unicode.
> > > [snip]
> > >
> > > When a string is boxed and the string includes characters whose
> > > display width differs from their byte length, the box is broken in
> > > J. It is not because of the font. It is because J assumes that
> > > every character's width is the same as its byte length, which is
> > > obviously false in many writing and encoding systems, including East
> > > Asian ones. We can definitely say J's box display isn't
> > > internationalized yet.
> > >
> > > For example, 54620 (as a Unicode code point) is a Korean character,
> > > which is pronounced "han". Its width is "Wide" (twice as wide as
> > > Latin letters).
> > >
> > > han=.4 u: 54620
> > > <han
> > > +---+
> > > |한|
> > > +---+
> > > <8 u: han
> > > +---+
> > > |한|
> > > +---+
> > >
> > > Since J counts the byte length for determining character's
width, and
> > > the byte length for han is 3 in UTF-8( 3-: #8 u: han ), the box's
> > > horizontal character '-'(of which width is "Narrow") is printed
three
> > > times, and on the display the box is broken.
> > >
> >
> >
> >
--------------------------------------------------------------------------------
> >
> >
> > >
----------------------------------------------------------------------
> > > For information about J forums see
http://www.jsoftware.com/forums.htm
> >
> >
----------------------------------------------------------------------
> > For information about J forums see
http://www.jsoftware.com/forums.htm
> >
>