Re: [containers-users] Possible additions to Containers and Friends

Simon Cruanes Sat, 10 Feb 2018 11:54:57 -0800

Hi Peter,

Thanks for the suggestions. I'm no expert in unicode, but I do agree
that such basic functionalities should be more easily available.
Maybe a `Ustring` module in containers would make sense (as a private
alias to `string`); most functionalities below would fit there, I think.


Would you consider opening PRs against gen, sequence, and containers
repositories so we can discuss that without spamming the list? I can
help if needed, or do it myself.
I'm interested in tests too, but will probably write some (possibly
using Uutf as a reference); some of the tests you wrote below I can
retrofit in the qtest mechanism.

Cheers!


Le Sat, 10 Feb 2018, peter frey wrote:
> (*
> Reading recent posts on discuss.ocal.org gives me the impression that some
> tiny
> number of utf related routines should be more easily available.
> Container's Sequence.t and Gen.t, in particular could benefit from a couple
> of
> simple routines.  The code below fits well into that frame work.
> I am treating it as public domain code but feel free to make it your own and
> to
> include it where appropriate. (Perhaps some of the tests could go into the
> example directory...)
> The routines here DO NOT verify unless its unavoidable.  In particular they
> accept ALL code points that can be encoded by the original Utf8 definition.
> It is only a matter of language; we could call it utf31 ...
> Restricting the range is trivial; as would be including some verification
> code.
> 
> 
> 
> *)
> 
> open Containers
> 
> (* Create a generator from a utf8-string. Each call produces a code point.
>  * The optional parameter srcIdx specifies then start point in the string.
>  * srcIdx must point to a valid suffix of a utf8 string.
>  * *)
> let gen_of_utf8 ?(srcIdx=ref 0) str =
>     let lim = String.length str in
>     let assemble_next () =     (* we come here only for multi-byte
> characters *)
>       let cv jmax accu =      (* utf8 character length; construction of
> uchar *)
>         let rec cv' j accu' =          (* inner loop j = 1..jmax ; each
> uchar *)
>           let ch = Char.code str.[ !srcIdx + j] in
>           let next = ( (accu' lsl 6) lor ( ch land 0x7f )) in
>           if j = jmax then begin     (* except for 1st, each char gives 6
> bits*)
>             srcIdx := !srcIdx + j +1; Some next            (* +1 for 1st
> char *)
>           end else cv' (succ j) next
>       in cv' 1  (* 1st char is already proccessed! *) accu
>     in if !srcIdx >= lim then None else
>     let n = str.[ !srcIdx ] in match n with
>     (* 0xxxxxxx *) | '\000' .. '\127' -> incr srcIdx; Some (int_of_char n)
>     (* 110yyyyy *) | '\128' .. '\223' -> cv 1 ((Char.code n) land 0b11111 )
>     (* 1110zzzz *) | '\224' .. '\239' -> cv 2 ((Char.code n) land 0b1111 )
>     (* 11110uuu *) | '\240' .. '\247' -> cv 3 ((Char.code n) land 0b111 )
>     (* 111110vv *) | '\248' .. '\251' -> cv 4 ((Char.code n) land 0b11 )
>     (* 1111110w *) | '\252' .. '\253' -> cv 5 ((Char.code n) land 0b1 )
>     (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream")
>   in  assemble_next;;
> 
> 
> (* The 'natural' stream representation of a utf-string is a generator.
>  * But Sequences are not far away ... *)
> let makeUtf8Seq ?(srcIdx=ref 0) str = Sequence.of_gen (gen_of_utf8 ~srcIdx
> str)
> 
> 
> 
> (* Convert a code point to a string; Hopefully some day this will be in the
>  * standard library. There are various equally trivial versions of this
> around.
>  * The returned string is created (allocated) fresh for each k.
>  * *)
> 
> let code_to_string k =
>   let mask = 0b111111 in
>   if k < 0 || k >= 0x4000000 then begin
>     let s = Bytes.create 6 in
>     Bytes.unsafe_set s 0 (Char.chr (0xfc + (k lsr 30)));
>     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 24) land
> mask)));
>     Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 18) land
> mask)));
>     Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
> mask)));
>     Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>     Bytes.unsafe_set s 5 (Char.unsafe_chr (0x80 lor (k land mask)));
>     s end
>   else if k <= 0x7f then
>     Bytes.make 1 (Char.unsafe_chr k)
>   else if k <= 0x7ff then begin
>     let s = Bytes.create 2 in
>     Bytes.unsafe_set s 0 (Char.unsafe_chr (0xc0 lor (k lsr 6)));
>     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor (k land mask)));
>     s end
>   else if k <= 0xffff then begin
>     let s = Bytes.create 3 in
>     Bytes.unsafe_set s 0 (Char.unsafe_chr (0xe0 lor (k lsr 12)));
>     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>     Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor (k land mask)));
>     s end
>   else if k <= 0x1fffff then begin
>     let s = Bytes.create 4 in
>     Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf0 + (k lsr 18)));
>     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
> mask)));
>     Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>     Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor (k land mask)));
>     s end
>   else begin
>     let s = Bytes.create 5 in
>     Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf8 + (k lsr 24)));
>     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 18) land
> mask)));
>     Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
> mask)));
>     Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>     Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor (k land mask)));
>     s end
> 
> let string_to_code str =
>     let cv jmax accu =      (* utf8 character length; construction of uchar
> *)
>     if jmax >  String.length str then raise (Failure "string_to_code")
>     else let rec cv' j accu' =       (* inner loop j = 1..jmax ; each uchar
> *)
>       let ch = Char.code (String.unsafe_get str j) in
>       let next = ( (accu' lsl 6) lor ( ch land 0x7f )) in
>       if j = jmax then next else cv' (succ j) next
>       in cv' 1  (* 1st char is already proccessed! *) accu
>     in let n = str.[0] in match n with
>     (* 0xxxxxxx *) | '\000' .. '\127' -> int_of_char n
>     (* 110yyyyy *) | '\128' .. '\223' -> cv 1 ((Char.code n) land 0b11111 )
>     (* 1110zzzz *) | '\224' .. '\239' -> cv 2 ((Char.code n) land 0b1111 )
>     (* 11110uuu *) | '\240' .. '\247' -> cv 3 ((Char.code n) land 0b111 )
>     (* 111110vv *) | '\248' .. '\251' -> cv 4 ((Char.code n) land 0b11 )
>     (* 1111110w *) | '\252' .. '\253' -> cv 5 ((Char.code n) land 0b1 )
>     (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream")
> 
> 
> (* code_into_string over-writes string s which must be 7-byte string.
>  * n-byte String ends with '\000' which is set as needed (in case you feed
> it
>  * to a c-program). The last byte of a string contains the # of unused bytes
> in
>  * then string. It is set here, for example, by "Bytes.unsafe_set s 6
> '\000'"
>  * If the string is longer than 1 word (plus header) all hell breaks loose.
>  * Use only if the string is copied afterwards. (Buffer.add_string ... etc)
>  * DUBIOUS (and about twice as fast because allocation is not needed)
>  * js_of_ocaml might be unhappy with this...
> *)
> let code_into_string s k =
>   let mask = 0b111111 in
>   if k < 0 || k >= 0x4000000 then begin
>     Bytes.unsafe_set s 0 (Char.chr (0xfc + (k lsr 30)));
>     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 24) land
> mask)));
>     Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 18) land
> mask)));
>     Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
> mask)));
>     Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>     Bytes.unsafe_set s 5 (Char.unsafe_chr (0x80 lor ( k land mask)));
>     Bytes.unsafe_set s 6 '\000';                   (* string internals s/b
> OK *)
>     Bytes.unsafe_set s 7 (Char.unsafe_chr  1 );   (* string internals
> DUBIOUS *)
>     () end
>   else if k <= 0x7f then begin
>     Bytes.unsafe_set s 0 (Char.chr k);
>     Bytes.unsafe_set s 1 '\000';
>     Bytes.unsafe_set s 7 (Char.unsafe_chr  6 );
>     () end
>   else if k <= 0x7ff then begin
>     Bytes.unsafe_set s 0 (Char.unsafe_chr (0xc0 lor (k lsr 6)));
>     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor (k land mask)));
>     Bytes.unsafe_set s 2 '\000';
>     Bytes.unsafe_set s 7 (Char.unsafe_chr  5 );
>     () end
>   else if k <= 0xffff then begin
>     Bytes.unsafe_set s 0 (Char.unsafe_chr (0xe0 lor (k lsr 12)));
>     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>     Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor (k land mask)));
>     Bytes.unsafe_set s 3 '\000';
>     Bytes.unsafe_set s 7 (Char.unsafe_chr  4 );
>     () end
>   else if k <= 0x1fffff then begin
>     Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf0 + (k lsr 18)));
>     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
> mask)));
>     Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>     Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor (k land mask)));
>     Bytes.unsafe_set s 4 '\000';
>     Bytes.unsafe_set s 7 (Char.unsafe_chr  3 );
>     () end
>   else begin
>     Bytes.unsafe_set s 0 (Char.unsafe_chr (0xf8 + (k lsr 24)));
>     Bytes.unsafe_set s 1 (Char.unsafe_chr (0x80 lor ((k lsr 18) land
> mask)));
>     Bytes.unsafe_set s 2 (Char.unsafe_chr (0x80 lor ((k lsr 12) land
> mask)));
>     Bytes.unsafe_set s 3 (Char.unsafe_chr (0x80 lor ((k lsr 6) land mask)));
>     Bytes.unsafe_set s 4 (Char.unsafe_chr (0x80 lor (k land mask)));
>     Bytes.unsafe_set s 5 '\000';
>     Bytes.unsafe_set s 7 (Char.unsafe_chr  2 );
>     () end
> 
> (* Automaton to map a (utf8) char Sequence.t into an int Sequence.t
>    Accept up to 6 characters, converting them to an integer to feed to k.
>    This code demonstrates that being at the receiving end of a sequence can
>    cause hardship ... It needs often a state-machine
>    ... and perhaps a better name  *)
>   let mapUtf8Char2Code (k: int -> unit) =
>     let rec next = ref first
>     and accu = ref 0
>     and first ch = match ch with
>       (* 0xxxxxxx *) | '\000' .. '\127' -> k (int_of_char ch); accu := 0
>       (* 110yyyyy *) | '\128' .. '\223' -> cv s1 ch 0b11111
>       (* 1110zzzz *) | '\224' .. '\239' -> cv s2 ch 0b1111
>       (* 11110uuu *) | '\240' .. '\247' -> cv s3 ch 0b111
>       (* 111110vv *) | '\248' .. '\251' -> cv s4 ch 0b11
>       (* 1111110w *) | '\252' .. '\253' -> cv s5 ch 0b1
>       (* 1111111X *) | '\254' .. '\255' -> raise (Failure "Bad stream")
>     and inline ch = accu := (!accu lsl 6) lor ((Char.code ch) land 0x7f )
>     and s1 ch = inline ch; next := first; k !accu
>     and s2 ch = inline ch; next := s1 and s3 ch = inline ch; next := s2
>     and s4 ch = inline ch; next := s3 and s5 ch = inline ch; next := s4
>     and cv startState initialValue mask =
>         next := startState;
>         accu := (Char.code initialValue) land mask
>   in (fun c -> !next c) ;;
> 
> let code_len k =
>   if k < 0 || k >= 0x4000000 then 6 else if k <= 0x7f then 1
>   else if k <= 0x7ff then 2 else if k <= 0xffff then 3
>   else if k <= 0x1fffff then 4 else 5
> 
> 
> (*
> (*
> ===========================================================================
>  *           various tests of above
> *)
> 
> 
> (* measure.ml
>  * included to avoid dependency on some other (more capable) measuring tool
>  *                                *)
> let measure fn arg (units:int)  comment =
>   (* arg is last argument of fn (or unit not belonging to fn);
>      argument units is only used for ratio (units /. elapsed) *)
>   let start = Unix.gettimeofday() in
>   let res = fn arg in
>   let endt = Unix.gettimeofday() in let elapsed = endt -. start in
>   let open Printf in
>   printf"\n%s\n\tTime:%f Units:%i Units/sec:%s uSecs/Unit:%s\n%!"
>         (sprintf"Measured <<%s>>" comment)
>         elapsed units
>         ( if units <= 1 then "N/A" else
>           sprintf"%8.0f" ((float_of_int units) /. elapsed ) )
>         ( if units <= 1 then "N/A" else
>           sprintf"%2.8f" (elapsed /. (float_of_int units) *. 1000000.0 ) );
>   res
> 
> 
> open Printf
> module S = Sequence
> module V = Vector
> 
> let code_to_string_test last =                              (*
> code_to_string *)
>   for i = 0 to last do ignore (code_to_string i)  done ;;
> 
> let code_into_string_test last =                          (*
> code_into_string *)
>   let str = Bytes.create 6 in
>   for i = 0 to last do ignore (code_into_string str i)  done ;;
> 
> let round_about2 last =                            (* code_to_string and
> back *)
>   for i = 0 to last do
>     let str = code_to_string i in (* convert code point to char string *)
>     let j = string_to_code str in  (* convert the char string back to code
> *)
>     assert( i = j)
>   done
> 
> let round_about3 last =                          (* code_into_string and
> back *)
>   let len = ref 1 in
>   let str = Bytes.create 6 in
>   for i = 0 to last do
>     code_into_string str i; (* convert code point to char string *)
>     let j = string_to_code str in  (* convert the char string back to code
> *)
>     assert( i = j);
>     if code_len i <> !len then (printf"New len at %x:%s\n" i str;
>                                len := code_len i )
>   done
> 
> 
> let make_big_string last = (* create a utf8 string of consecutive code
> points *)
>   let buf = Buffer.create (16 * 1024 * 1024 ) in (* ocaml tolerates large *)
>   let str = Bytes.create 6 in
>   for i = 0 to last do
>     code_into_string str i;
>     Buffer.add_string buf str;
>   done;
>   Buffer.contents buf
>   ;;
> 
> let make_big_string last = (* create a utf8 string of consecutive code
> points *)
>   let buf = Buffer.create (16 * 1024 * 1024 ) in (* ocaml tolerates large *)
>   let str = Bytes.create 7 in
>   for i = 0 to last do
>     code_into_string str i;
>     Buffer.add_string buf str;
>   done;
>   let b = Buffer.contents buf in
>   printf"\n\ncreated %i strings. \nfinal length: %i bytes. \
>          \nAvg len %f\n%!"
>          last
>          (String.length b)
>          ((float_of_int (String.length b)) /. (float_of_int last));
>   b
> 
> let decode_big big =              (* create a utf8 string; convert to int
> S.t *)
>   let check = ref 0 in             (* map char S.t to int S.t and test
> result *)
>   (String.to_seq big)
>   (mapUtf8Char2Code (fun j -> assert (j = !check) ; incr check))
> 
> 
> let last1 = 0x3ffffff;; (*    67_108_863 *)
> let last2 = 0x7fffffff;; (* 1073_741_823 *)
> let last = last2 (* last2 uses ALL code points and takes a while *)
> 
> (* Sample: Convert a utf8 char string to a Vector *)
> let utf8_to_vector str = str |> gen_of_utf8 |> V.of_gen
> 
> let _ = printf "\"SKサイトリf\" |> utf8_to_vector \n\t |> Vector.to_seq |>
> S.map code_to_string |> fun seq -> seq (printf\"%s\t\")\n!";;
> 
> "SKサイトリf" |> utf8_to_vector |> Vector.to_seq |> S.map code_to_string |>
>  fun seq -> seq (printf"%s\t");;
> 
> (* Test below does not work for last 2 because it allocates a to big string
> *)
> let big = measure make_big_string last1 last1 "make_big_string";;
> measure decode_big big last1 "decode_big";;
> 
> (* Here last2 encodes/decodes ALL possible code points *)
> measure round_about2 last last "round_about2";;
> measure round_about3 last last "round_about3";;
> measure code_into_string_test last last "code_into_string";;
> measure code_to_string_test last last "code_to_string";;
> 
> let _ = (* One can never have to many samples ... *)
>  printf"\n\n    "; S.( 0 -- 15) (fun i -> printf" %2x" i);
>  S.( 0 -- 15)
>   (fun i -> printf"\n%4x  " (i * 16 + 0x2500);
>     S.( (0x2500 + i * 16) -- (0x250F + i * 16))
>     |> S.map code_to_string
>     |> S.to_list |> String.concat "  " |> print_string
>   );;
> *)
> 


-- 
Simon Cruanes

http://weusepgp.info/
key 49AA62B6, fingerprint 949F EB87 8F06 59C6 D7D3  7D8D 4AC0 1D08 49AA 62B6

signature.asc
Description: PGP signature

_______________________________________________
Containers-users mailing list
Containers-users@lists.ocaml.org
http://lists.ocaml.org/listinfo/containers-users

Re: [containers-users] Possible additions to Containers and Friends

Reply via email to