Hi, I'm creating some sorting tables. While researching this topic I stumbled on the Polish dictionary sorting rules: if two strings are equal except for case, then the one that begins with a lowercase letter takes precedence.[1] (This seems to apply to the Swedish order as well, but I have no means to verify that. Apparently my German dictionary (from 1991) follows the same rule without explicitly stating so.)
Context seems to prefer it the other way round, so I modified two functions from sort-ini.lua to handle that; but I'm not happy with this solution. So my question: is there already, or could we have some mechanism to influence the details of sorting in context? Thanks for your help, Philipp [1] <ftp://ftp.gust.org.pl/pub/GUST/bulletin/03/02-bl.pdf>, p. 7. -- () ascii ribbon campaign - against html e-mail /\ www.asciiribbon.org - against proprietary attachments
--- Testing environment for the sorters defined in polishsort.lua.
dofile "polishsort.lua"

document.whatever = { }
document.whatever.words = { }

local my = {}

--- Substitute every character of a set by a replacement.
-- @param s     string to process
-- @param patt  string whose characters form the set handed to lpeg.S
-- @param repl  replacement for each matched character
-- @return the result of lpeg.match on the substitution pattern
function my.gsub (s, patt, repl)
    local charset     = lpeg.S(patt)
    local substituter = lpeg.Cs((charset / repl + 1)^0)
    return lpeg.match(substituter, s)
end

--- based on http://www.mail-archive.com/ntg-context@ntg.nl/msg47525.html
--- Split document.whatever.text into words, append them to
--- document.whatever.words, attach a split to each and sort the list.
function document.whatever.sorttext()
    local flattened = my.gsub(document.whatever.text, '\n\t\v"', " ")
    --local splitter = sorters.splitters.utf
    local splitter = sorters.splitters.utflower
    local pieces    = string.explode(flattened, " +")
    local entries   = document.whatever.words
    for index = 1, #pieces do
        local stripped = string.strip(pieces[index])
        if stripped ~= "" then
            entries[#entries + 1] = { word = stripped }
        end
    end
    for index = 1, #entries do
        local entry = entries[index]
        entry.split = splitter(entry.word)
    end
    --sorters.sort(entries, sorters.comparers.basic)
    sorters.sort(entries, sorters.comparers.polish)
end

--- Typeset the word list: a section whenever the second value returned by
--- sorters.firstofsplit changes, and each word (prefixed by its index)
--- unless it equals the word typeset just before it.
function document.whatever.flushtext()
    local entries      = document.whatever.words
    local last_tag     = false
    local last_printed = false
    for index = 1, #entries do
        local entry = entries[index]
        local first, tag = sorters.firstofsplit(entry)
        local heading = utf.lower(first)
        if tag ~= last_tag then
            last_tag = tag
            context.section(heading)
        end
        local text = entry.word
        if text ~= last_printed then
            local line = tostring(index) .. ": " .. text
            context(line)
            context.par()
            last_printed = text
        end
    end
end

--- Run the complete test for one sorting language.
-- @param lang  language tag passed to sorters.setlanguage
function testrun (lang)
    --f = assert(io.open("anna-utf.txt", "r"))
    --f = assert(io.open("sltext.txt", "r"))
    document.whatever.text = [[ polskie słowa dziwnie się szereguje Polskie Słowa Dziwnie Się Szereguje ]]
    sorters.setlanguage(lang)
    context.starttext()
    document.whatever.sorttext()
    document.whatever.flushtext()
    context.stoptext()
end

testrun("pl")
--- Polish sorting (including the letters q, v, x)

-- The alphabet in collation order. Both the entries table (letter -> letter)
-- and the mappings table (letter -> rank) are derived from this single list
-- so that they cannot drift apart.
local alphabet = {
    "a", "ą", "b", "c", "ć", "d", "e", "ę", "f", "g", "h", "i",
    "j", "k", "l", "ł", "m", "n", "ń", "o", "ó", "p", "q", "r",
    "s", "ś", "t", "u", "v", "w", "x", "y", "z", "ź", "ż",
}

local pl_entries, pl_mappings = { }, { }
for rank = 1, #alphabet do
    local letter = alphabet[rank]
    pl_entries[letter]  = letter
    pl_mappings[letter] = rank
end

sorters.replacements["pl"] = { }
sorters.entries["pl"]      = pl_entries
sorters.mappings["pl"]     = pl_mappings

local currentreplacements = sorters.replacements["pl"] or { }
local currentmappings     = sorters.mappings["pl"] or { }
local currententries      = sorters.entries["pl"] or { }

local gsub          = string.gsub
local utfcharacters = string.utfcharacters
local utfbyte       = utf.byte

--- Three-way comparison of two arrays of comparable values.
-- Elements are compared pairwise over the common prefix; if that prefix is
-- equal the shorter array sorts first.
-- @param sort_a  first array (may be nil)
-- @param sort_b  second array (may be nil)
-- @return -1, 0 or 1; 0 when either argument is missing
local function basicsort(sort_a, sort_b)
    if not sort_a or not sort_b then
        return 0
    end
    local na, nb = #sort_a, #sort_b
    for i = 1, math.min(na, nb) do
        local ai, bi = sort_a[i], sort_b[i]
        if ai > bi then
            return 1
        elseif ai < bi then
            return -1
        end
    end
    if na > nb then
        return 1
    elseif na < nb then
        return -1
    else
        return 0
    end
end

--- Tie-break for strings that are equal except for case: the split whose
--- original string began lowercase sorts first.
-- @return -1, 0 or 1
local function lowerfirst(split_a, split_b)
    local la, lb = split_a.first_lower, split_b.first_lower
    if lb and not la then
        return 1
    elseif la and not lb then
        return -1
    else
        return 0
    end
end

--- Comparer modified from sorters.comparers.basic: compares the `e` arrays,
--- then the `m` arrays, and finally prefers the entry that begins lowercase.
-- @param a  entry with a `split` field as produced by sorters.splitters.utflower
-- @param b  entry with a `split` field as produced by sorters.splitters.utflower
-- @return -1, 0 or 1
function sorters.comparers.polish(a, b)
    local ea, eb = a.split, b.split
    local na, nb = #ea, #eb
    if na == 0 and nb == 0 then
        -- simple variant (single word)
        local result = basicsort(ea.e, eb.e)
        if result == 0 then
            result = basicsort(ea.m, eb.m)
        end
        if result ~= 0 then
            -- a real difference always wins over the case tie-break
            return result
        end
        return lowerfirst(ea, eb)
    else
        -- complex variant, used in register (multiple words)
        local result = 0
        for i = 1, math.min(na, nb) do
            local eai, ebi = ea[i], eb[i]
            result = basicsort(eai.e, ebi.e)
            if result == 0 then
                result = basicsort(eai.m, ebi.m) -- only needed if there are m's
            end
            if result ~= 0 then
                break
            end
        end
        if result ~= 0 then
            return result
        elseif na > nb then
            return 1
        elseif nb > na then
            return -1
        else
            -- na == nb and not both zero, so both first parts exist
            return lowerfirst(ea[1], eb[1])
        end
    end
end

--- Splitter modified from sorters.splitters.utf: lowercases the string before
--- splitting and records whether the original began with a lowercase character.
-- @param str  string to split
-- @return table with arrays `s` (characters), `e` and `m` (sort values) and
--         the boolean `first_lower`
function sorters.splitters.utflower(str)
    local first_char = utf.sub(str, 1, 1)
    str = utf.lower(str)
    for k = 1, #currentreplacements do
        local v = currentreplacements[k]
        str = gsub(str, v[1], v[2])
    end
    local s, e, m, n = { }, { }, { }, 0
    for sc in utfcharacters(str) do -- maybe an lpeg
        local ec = currententries[sc]
        local mc = currentmappings[sc] or utfbyte(sc)
        n = n + 1
        s[n] = sc
        -- characters without an entry fall back to their own mapping value
        e[n] = (ec and currentmappings[ec]) or mc
        m[n] = mc
    end
    return {
        s = s,
        e = e,
        m = m,
        first_lower = first_char == utf.lower(first_char),
    }
end
pgpppyqfxgWKU.pgp
Description: PGP signature
___________________________________________________________________________________ If your question is of interest to others as well, please add an entry to the Wiki! maillist : ntg-context@ntg.nl / http://www.ntg.nl/mailman/listinfo/ntg-context webpage : http://www.pragma-ade.nl / http://tex.aanhet.net archive : http://foundry.supelec.fr/projects/contextrev/ wiki : http://contextgarden.net ___________________________________________________________________________________