Hi Peter, Thanks for the suggestions. I'm no expert in unicode, but I do agree that such basic functionalities should be more easily available. Maybe a `Ustring` module in containers would make sense (as a private alias to `string`); most functionalities below would fit there, I think.
Would you consider opening PRs against gen, sequence, and containers repositories so we can discuss that without spamming the list? I can help if needed, or do it myself. I'm interested in tests too, but will probably write some (possibly using Uutf as a reference); some of the tests you wrote below I can retrofit in the qtest mechanism. Cheers! Le Sat, 10 Feb 2018, peter frey wrote: > (* > Reading recent posts on discuss.ocal.org gives me the impression that some > tiny > number of utf related routines should be more easily available. > Container's Sequence.t and Gen.t, in particular could benefit from a couple > of > simple routines. The code below fits well into that frame work. > I am treating it as public domain code but feel free to make it your own and > to > include it where appropriate. (Perhaps some of the tests could go into the > example directory...) > The routines here DO NOT verify unless its unavoidable. In particular they > accept ALL code points that can be encoded by the original Utf8 definition. > It is only a matter of language; we could call it utf31 ... > Restricting the range is trivial; as would be including some verification > code. > > > > *) > > open Containers > > (* Create a generator from a utf8-string. Each call produces a code point. > * The optional parameter srcIdx specifies then start point in the string. > * srcIdx must point to a valid suffix of a utf8 string. > * *) > let gen_of_utf8 ?(srcIdx=ref 0) str = > let lim = String.length str in > let assemble_next () = (* we come here only for multi-byte > characters *) > let cv jmax accu = (* utf8 character length; construction of > uchar *) > let rec cv' j accu' = (* inner loop j = 1..jmax ; each > uchar *) > let ch = Char.code str.[ !srcIdx + j] in > let next = ( (accu' lsl 6) lor ( ch land 0x7f )) in > if j = jmax then begin (* except for 1st, each char gives 6 > bits*) > srcIdx := !srcIdx + j +1; Some next (* +1 for 1st > char *) > end else cv' (succ j) next > in cv' 1 (* 1st char is already proccessed! *) accu > in if !srcIdx >= lim then None else > let n = str.[ !srcIdx ] in match n with > (* 0xxxxxxx *) | '\000' .. '\127' -> incr srcIdx; Some (int_of_char n) > (* 110yyyyy *) | '\128' .. '\223' -> cv 1 ((Char.code n) land 0b11111 ) > (* 1110zzzz *) | '\224' .. '\239' -> cv 2 ((Char.code n) land 0b1111 ) > (* 11110uuu *) | '\240' .. '\247' -> cv 3 ((Char.code n) land 0b111 ) > (* 111110vv *) | '\248' .. '\251' -> cv 4 ((Char.code n) land 0b11 ) > (* 1111110w *) | '\252' .. '\253' -> cv 5 ((Char.code n) land 0b1 ) > (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream") > in assemble_next;; > > > (* The 'natural' stream representation of a utf-string is a generator. > * But Sequences are not far away ... *) > let makeUtf8Seq ?(srcIdx=ref 0) str = Sequence.of_gen (gen_of_utf8 ~srcIdx > str) > > > > (* Convert a code point to a string; Hopefully some day this will be in the > * standard library. There are various equally trivial versions of this > around. > * The returned string is created (allocated) fresh for each k. > * *) > > let code_to_string k = > let mask = 0b111111 in > if k < 0 || k >= 0x4000000 then begin > let s = Bytes.create 6 in > Bytes.unsafe_set s 0 (Char.chr (0xfc + (k lsr 30))); > Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 24) land > mask))); > Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 18) land > mask))); > Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 12) land > mask))); > Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask))); > Bytes.unsafe_set s 5 (Char.unsafe_chr (0x80 lor (k land mask))); > s end > else if k <= 0x7f then > Bytes.make 1 (Char.unsafe_chr k) > else if k <= 0x7ff then begin > let s = Bytes.create 2 in > Bytes.unsafe_set s 0 (Char.unsafe_chr (0xc0 lor (k lsr 6))); > Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor (k land mask))); > s end > else if k <= 0xffff then begin > let s = Bytes.create 3 in > Bytes.unsafe_set s 0 (Char.unsafe_chr (0xe0 lor (k lsr 12))); > Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask))); > Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor (k land mask))); > s end > else if k <= 0x1fffff then begin > let s = Bytes.create 4 in > Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf0 + (k lsr 18))); > Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 12) land > mask))); > Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask))); > Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor (k land mask))); > s end > else begin > let s = Bytes.create 5 in > Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf8 + (k lsr 24))); > Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 18) land > mask))); > Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 12) land > mask))); > Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask))); > Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor (k land mask))); > s end > > let string_to_code str = > let cv jmax accu = (* utf8 character length; construction of uchar > *) > if jmax > String.length str then raise (Failure "string_to_code") > else let rec cv' j accu' = (* inner loop j = 1..jmax ; each uchar > *) > let ch = Char.code (String.unsafe_get str j) in > let next = ( (accu' lsl 6) lor ( ch land 0x7f )) in > if j = jmax then next else cv' (succ j) next > in cv' 1 (* 1st char is already proccessed! *) accu > in let n = str.[0] in match n with > (* 0xxxxxxx *) | '\000' .. '\127' -> int_of_char n > (* 110yyyyy *) | '\128' .. '\223' -> cv 1 ((Char.code n) land 0b11111 ) > (* 1110zzzz *) | '\224' .. '\239' -> cv 2 ((Char.code n) land 0b1111 ) > (* 11110uuu *) | '\240' .. '\247' -> cv 3 ((Char.code n) land 0b111 ) > (* 111110vv *) | '\248' .. '\251' -> cv 4 ((Char.code n) land 0b11 ) > (* 1111110w *) | '\252' .. '\253' -> cv 5 ((Char.code n) land 0b1 ) > (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream") > > > (* code_into_string over-writes string s which must be 7-byte string. > * n-byte String ends with '\000' which is set as needed (in case you feed > it > * to a c-program). The last byte of a string contains the # of unused bytes > in > * then string. It is set here, for example, by "Bytes.unsafe_set s 6 > '\000'" > * If the string is longer than 1 word (plus header) all hell breaks loose. > * Use only if the string is copied afterwards. (Buffer.add_string ... etc) > * DUBIOUS (and about twice as fast because allocation is not needed) > * js_of_ocaml might be unhappy with this... > *) > let code_into_string s k = > let mask = 0b111111 in > if k < 0 || k >= 0x4000000 then begin > Bytes.unsafe_set s 0 (Char.chr (0xfc + (k lsr 30))); > Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 24) land > mask))); > Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 18) land > mask))); > Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 12) land > mask))); > Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask))); > Bytes.unsafe_set s 5 (Char.unsafe_chr (0x80 lor ( k land mask))); > Bytes.unsafe_set s 6 '\000'; (* string internals s/b > OK *) > Bytes.unsafe_set s 7 (Char.unsafe_chr 1 ); (* string internals > DUBIOUS *) > () end > else if k <= 0x7f then begin > Bytes.unsafe_set s 0 (Char.chr k); > Bytes.unsafe_set s 1 '\000'; > Bytes.unsafe_set s 7 (Char.unsafe_chr 6 ); > () end > else if k <= 0x7ff then begin > Bytes.unsafe_set s 0 (Char.unsafe_chr (0xc0 lor (k lsr 6))); > Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor (k land mask))); > Bytes.unsafe_set s 2 '\000'; > Bytes.unsafe_set s 7 (Char.unsafe_chr 5 ); > () end > else if k <= 0xffff then begin > Bytes.unsafe_set s 0 (Char.unsafe_chr (0xe0 lor (k lsr 12))); > Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask))); > Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor (k land mask))); > Bytes.unsafe_set s 3 '\000'; > Bytes.unsafe_set s 7 (Char.unsafe_chr 4 ); > () end > else if k <= 0x1fffff then begin > Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf0 + (k lsr 18))); > Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 12) land > mask))); > Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask))); > Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor (k land mask))); > Bytes.unsafe_set s 4 '\000'; > Bytes.unsafe_set s 7 (Char.unsafe_chr 3 ); > () end > else begin > Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf8 + (k lsr 24))); > Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 18) land > mask))); > Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 12) land > mask))); > Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask))); > Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor (k land mask))); > Bytes.unsafe_set s 5 '\000'; > Bytes.unsafe_set s 7 (Char.unsafe_chr 2 ); > () end > > (* Automaton to map a (utf8) char Sequence.t into an int Sequence.t > Accept up to 6 characters, converting them to an integer to feed to k. > This code demonstrates that being at the receiving end of a sequence can > cause hardship ... It needs often a state-machine > ... and perhaps a better name *) > let mapUtf8Char2Code (k: int -> unit) = > let rec next = ref first > and accu = ref 0 > and first ch = match ch with > (* 0xxxxxxx *) | '\000' .. '\127' -> k (int_of_char ch); accu := 0 > (* 110yyyyy *) | '\128' .. '\223' -> cv s1 ch 0b11111 > (* 1110zzzz *) | '\224' .. '\239' -> cv s2 ch 0b1111 > (* 11110uuu *) | '\240' .. '\247' -> cv s3 ch 0b111 > (* 111110vv *) | '\248' .. '\251' -> cv s4 ch 0b11 > (* 1111110w *) | '\252' .. '\253' -> cv s5 ch 0b1 > (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream") > and inline ch = accu := (!accu lsl 6) lor ((Char.code ch) land 0x7f ) > and s1 ch = inline ch; next := first; k !accu > and s2 ch = inline ch; next := s1 and s3 ch = inline ch; next := s2 > and s4 ch = inline ch; next := s3 and s5 ch = inline ch; next := s4 > and cv startState initialValue mask = > next := startState; > accu := (Char.code initialValue) land mask > in (fun c -> !next c) ;; > > let code_len k = > if k < 0 || k >= 0x4000000 then 6 else if k <= 0x7f then 1 > else if k <= 0x7ff then 2 else if k <= 0xffff then 3 > else if k <= 0x1fffff then 4 else 5 > > > (* > (* > =========================================================================== > * various tests of above > *) > > > (* measure.ml > * included to avoid dependency on some other (more capable) measuring tool > * *) > let measure fn arg (units:int) comment = > (* arg is last argument of fn (or unit not belonging to fn); > argument units is only used for ratio (units /. elapsed) *) > let start = Unix.gettimeofday() in > let res = fn arg in > let endt = Unix.gettimeofday() in let elapsed = endt -. start in > let open Printf in > printf"\n%s\n\tTime:%f Units:%i Units/sec:%s uSecs/Unit:%s\n%!" > (sprintf"Measured <<%s>>" comment) > elapsed units > ( if units <= 1 then "N/A" else > sprintf"%8.0f" ((float_of_int units) /. elapsed ) ) > ( if units <= 1 then "N/A" else > sprintf"%2.8f" (elapsed /. (float_of_int units) *. 1000000.0 ) ); > res > > > open Printf > module S = Sequence > module V = Vector > > let code_to_string_test last = (* > code_to_string *) > for i = 0 to last do ignore (code_to_string i) done ;; > > let code_into_string_test last = (* > code_into_string *) > let str = Bytes.create 6 in > for i = 0 to last do ignore (code_into_string str i) done ;; > > let round_about2 last = (* code_to_string and > back *) > for i = 0 to last do > let str = code_to_string i in (* convert code point to char string *) > let j = string_to_code str in (* convert the char string back to code > *) > assert( i = j) > done > > let round_about3 last = (* code_into_string and > back *) > let len = ref 1 in > let str = Bytes.create 6 in > for i = 0 to last do > code_into_string str i; (* convert code point to char string *) > let j = string_to_code str in (* convert the char string back to code > *) > assert( i = j); > if code_len i <> !len then (printf"New len at %x:%s\n" i str; > len := code_len i ) > done > > > let make_big_string last = (* create a utf8 string of consecutive code > points *) > let buf = Buffer.create (16 * 1024 * 1024 ) in (* ocaml tolerates large *) > let str = Bytes.create 6 in > for i = 0 to last do > code_into_string str i; > Buffer.add_string buf str; > done; > Buffer.contents buf > ;; > > let make_big_string last = (* create a utf8 string of consecutive code > points *) > let buf = Buffer.create (16 * 1024 * 1024 ) in (* ocaml tolerates large *) > let str = Bytes.create 7 in > for i = 0 to last do > code_into_string str i; > Buffer.add_string buf str; > done; > let b = Buffer.contents buf in > printf"\n\ncreated %i strings. \nfinal length: %i bytes. \ > \nAvg len %f\n%!" > last > (String.length b) > ((float_of_int (String.length b)) /. (float_of_int last)); > b > > let decode_big big = (* create a utf8 string; convert to int > S.t *) > let check = ref 0 in (* map char S.t to int S.t and test > result *) > (String.to_seq big) > (mapUtf8Char2Code (fun j -> assert (j = !check) ; incr check)) > > > let last1 = 0x3ffffff;; (* 67_108_863 *) > let last2 = 0x7fffffff;; (* 1073_741_823 *) > let last = last2 (* last2 uses ALL code points and takes a while *) > > (* Sample: Convert a utf8 char string to a Vector *) > let utf8_to_vector str = str |> gen_of_utf8 |> V.of_gen > > let _ = printf "\"SKサイトリf\" |> utf8_to_vector \n\t |> Vector.to_seq |> > S.map code_to_string |> fun seq -> seq (printf\"%s\t\")\n!";; > > "SKサイトリf" |> utf8_to_vector |> Vector.to_seq |> S.map code_to_string |> > fun seq -> seq (printf"%s\t");; > > (* Test below does not work for last 2 because it allocates a to big string > *) > let big = measure make_big_string last1 last1 "make_big_string";; > measure decode_big big last1 "decode_big";; > > (* Here last2 encodes/decodes ALL possible code points *) > measure round_about2 last last "round_about2";; > measure round_about3 last last "round_about3";; > measure code_into_string_test last last "code_into_string";; > measure code_to_string_test last last "code_to_string";; > > let _ = (* One can never have to many samples ... *) > printf"\n\n "; S.( 0 -- 15) (fun i -> printf" %2x" i); > S.( 0 -- 15) > (fun i -> printf"\n%4x " (i * 16 + 0x2500); > S.( (0x2500 + i * 16) -- (0x250F + i * 16)) > |> S.map code_to_string > |> S.to_list |> String.concat " " |> print_string > );; > *) > -- Simon Cruanes http://weusepgp.info/ key 49AA62B6, fingerprint 949F EB87 8F06 59C6 D7D3 7D8D 4AC0 1D08 49AA 62B6
signature.asc
Description: PGP signature
_______________________________________________ Containers-users mailing list Containers-users@lists.ocaml.org http://lists.ocaml.org/listinfo/containers-users