Read the file in the proper encoding

Garry_Galler Tue, 19 Jul 2016 16:15:03 +0200

Would it be possible in the future to add to the system module a procedure 
for opening/reading a file in the proper encoding?


For now I use this variant: 
    
    
    import strutils, encodings
    proc readFile*(filePath:string,
                   sourceEncoding:string,
                   destEncoding:string = "UTF8"): TaintedString =
      ## Reads the whole file at `filePath` and returns its content.
      ##
      ## When `sourceEncoding` and `destEncoding` differ (compared ignoring
      ## case), the content is converted from `sourceEncoding` to
      ## `destEncoding`; otherwise it is returned exactly as read.
      ##
      ## Raises `IOError` when the file cannot be opened.
      var handle: File
      if not handle.open(filePath):
        raise newException(IOError,"Could not open file:" & filePath)
      # Close the handle on every exit path, including a failed conversion.
      defer: handle.close()
      let raw = handle.readAll()
      let sameEncoding = cmpIgnoreCase(destEncoding, sourceEncoding) == 0
      if sameEncoding:
        result = raw
      else:
        result = raw.convert(destEncoding, sourceEncoding)
    

* * *

And extend the encodings module with functions such as these:
    
    
    proc isUTF8*(bom:openarray[uint8]):bool =
      ## Reports whether `bom` begins with the UTF-8 byte order mark
      ## (EF BB BF).
      ##
      ## Returns false, instead of failing on an out-of-range index, when
      ## `bom` holds fewer than three bytes.
      const mark = [239'u8, 187'u8, 191'u8]
      if bom.len < mark.len:
        return false
      # Compare in place; no temporary seq is needed.
      for i in 0 ..< mark.len:
        if bom[i] != mark[i]:
          return false
      true
    
    proc isUTF16BE*(bom:openarray[uint8]):bool=
      ## Reports whether `bom` begins with the UTF-16 big-endian byte order
      ## mark (FE FF).
      ##
      ## Returns false, instead of failing on an out-of-range index, when
      ## `bom` holds fewer than two bytes.
      const mark = [254'u8, 255'u8]
      if bom.len < mark.len:
        return false
      # Compare in place; no temporary seq is needed.
      for i in 0 ..< mark.len:
        if bom[i] != mark[i]:
          return false
      true
    
    proc isUTF16LE*(bom:openarray[uint8]):bool=
      ## Reports whether `bom` begins with the UTF-16 little-endian byte order
      ## mark (FF FE).
      ##
      ## Only the first two bytes are examined, so input that starts with
      ## FF FE 00 00 also yields true. Returns false, instead of failing on
      ## an out-of-range index, when `bom` holds fewer than two bytes.
      const mark = [255'u8, 254'u8]
      if bom.len < mark.len:
        return false
      # Compare in place; no temporary seq is needed.
      for i in 0 ..< mark.len:
        if bom[i] != mark[i]:
          return false
      true
    
    proc isUTF32BE*(bom:openarray[uint8]):bool=
      ## Reports whether `bom` begins with the UTF-32 big-endian byte order
      ## mark (00 00 FE FF).
      ##
      ## Returns false, instead of failing on an out-of-range index, when
      ## `bom` holds fewer than four bytes.
      const mark = [0'u8, 0'u8, 254'u8, 255'u8]
      if bom.len < mark.len:
        return false
      # Compare in place; no temporary seq is needed.
      for i in 0 ..< mark.len:
        if bom[i] != mark[i]:
          return false
      true
    
    proc isUTF32LE*(bom:openarray[uint8]):bool=
      ## Reports whether `bom` begins with the UTF-32 little-endian byte order
      ## mark (FF FE 00 00).
      ##
      ## Returns false, instead of failing on an out-of-range index, when
      ## `bom` holds fewer than four bytes.
      const mark = [255'u8, 254'u8, 0'u8, 0'u8]
      if bom.len < mark.len:
        return false
      # Compare in place; no temporary seq is needed.
      for i in 0 ..< mark.len:
        if bom[i] != mark[i]:
          return false
      true
    
    proc isUTFEBCDIC*(bom:openarray[uint8]):bool=
      ## Reports whether `bom` begins with the UTF-EBCDIC byte order mark
      ## (DD 73 66 73).
      ##
      ## Returns false, instead of failing on an out-of-range index, when
      ## `bom` holds fewer than four bytes.
      const mark = [221'u8, 115'u8, 102'u8, 115'u8]
      if bom.len < mark.len:
        return false
      # Compare in place; no temporary seq is needed.
      for i in 0 ..< mark.len:
        if bom[i] != mark[i]:
          return false
      true
    
    proc isGB18030*(bom:openarray[uint8]):bool=
      ## Reports whether `bom` begins with the GB 18030 byte order mark
      ## (84 31 95 33).
      ##
      ## Returns false, instead of failing on an out-of-range index, when
      ## `bom` holds fewer than four bytes.
      const mark = [132'u8, 49'u8, 149'u8, 51'u8]
      if bom.len < mark.len:
        return false
      # Compare in place; no temporary seq is needed.
      for i in 0 ..< mark.len:
        if bom[i] != mark[i]:
          return false
      true
    
    proc isSCSU*(bom:openarray[uint8]):bool=
      ## Reports whether `bom` begins with the SCSU byte order mark
      ## (0E FE FF).
      ##
      ## Returns false, instead of failing on an out-of-range index, when
      ## `bom` holds fewer than three bytes.
      const mark = [14'u8, 254'u8, 255'u8]
      if bom.len < mark.len:
        return false
      # Compare in place; no temporary seq is needed.
      for i in 0 ..< mark.len:
        if bom[i] != mark[i]:
          return false
      true
    
    proc isUTF1*(bom:openarray[uint8]):bool=
      ## Reports whether `bom` begins with the UTF-1 byte order mark
      ## (F7 64 4C).
      ##
      ## Returns false, instead of failing on an out-of-range index, when
      ## `bom` holds fewer than three bytes.
      const mark = [247'u8, 100'u8, 76'u8]
      if bom.len < mark.len:
        return false
      # Compare in place; no temporary seq is needed.
      for i in 0 ..< mark.len:
        if bom[i] != mark[i]:
          return false
      true
    
    proc isBOCU1*(bom:openarray[uint8]):bool=
      ## Reports whether `bom` begins with the BOCU-1 byte order mark
      ## (FB EE 28).
      ##
      ## Returns false, instead of failing on an out-of-range index, when
      ## `bom` holds fewer than three bytes.
      const mark = [251'u8, 238'u8, 40'u8]
      if bom.len < mark.len:
        return false
      # Compare in place; no temporary seq is needed.
      for i in 0 ..< mark.len:
        if bom[i] != mark[i]:
          return false
      true
    
    proc isUTF7*(bom:openarray[uint8]):bool=
      ## Reports whether `bom` begins with a UTF-7 byte order mark: the bytes
      ## 2B 2F 76 ("+/v") followed by one of 38, 39, 2B or 2F.
      ##
      ## The four marks are alternatives rather than one 16-byte sequence, so
      ## only the first four bytes are examined. Returns false, instead of
      ## failing on an out-of-range index, when `bom` holds fewer than four
      ## bytes.
      const prefix = [43'u8, 47'u8, 118'u8]
      if bom.len < prefix.len + 1:
        return false
      for i in 0 ..< prefix.len:
        if bom[i] != prefix[i]:
          return false
      # The fourth byte selects which of the four alternative marks is present.
      bom[prefix.len] in {56'u8, 57'u8, 43'u8, 47'u8}
    
    proc isUTF16*(bom:openarray[uint8]):bool =
      ## True when `bom` passes either the big-endian or the little-endian
      ## UTF-16 byte order mark check.
      if isUTF16BE(bom): true
      else: isUTF16LE(bom)
    
    proc isUTF32*(bom:openarray[uint8]):bool =
      ## True when `bom` passes either the big-endian or the little-endian
      ## UTF-32 byte order mark check.
      if isUTF32BE(bom): true
      else: isUTF32LE(bom)
    
    
    import strutils, encodings
    
    proc getFileEncoding*(filePath:string):string=
      ## Guesses the encoding of the file at `filePath` from its byte order
      ## mark and returns the encoding name, or "" when no known mark is
      ## found.
      ##
      ## Up to 16 leading bytes are read. Each check is applied only when the
      ## file actually supplied enough bytes for that mark, so the zero
      ## padding of the buffer is never mistaken for file content (for
      ## example a two-byte file FF FE is UTF-16LE, not UTF-32LE).
      ##
      ## Raises `IOError` when the file cannot be opened.
      var
        box:array[16,uint8]
        file:File
      if not open(file,filePath):
        raise newException(IOError,"Could not open file:" & filePath)
      defer:file.close()
      # Number of bytes really read; the rest of `box` stays zero.
      let got = file.readBytes(box,0,box.len)
      # The four-byte UTF-32 marks are tested before the two-byte UTF-16
      # marks because the UTF-32LE mark starts with the UTF-16LE mark.
      result =
        if got >= 4 and isUTF32BE(box):     "UTF-32BE"
        elif got >= 4 and isUTF32LE(box):   "UTF-32LE"
        elif got >= 2 and isUTF16BE(box):   "UTF-16BE"
        elif got >= 2 and isUTF16LE(box):   "UTF-16LE"
        elif got >= 3 and isUTF8(box):      "UTF-8"
        elif got >= 4 and isUTFEBCDIC(box): "UTF-EBCDIC"
        elif got >= 4 and isGB18030(box):   "GB-18030"
        elif got >= 3 and isSCSU(box):      "SCSU"
        elif got >= 3 and isUTF1(box):      "UTF-1"
        elif got >= 3 and isBOCU1(box):     "BOCU-1"
        elif got >= 4 and isUTF7(box):      "UTF-7"
        else:                               ""
    

PS: In fact, a module like **chardet** (Python) would be even more 
powerful. :)

Read the file in the proper encoding

Reply via email to