Byte representation of unicode string

This is Python 3 code:

>>> bytes(json.dumps({'Ä':0}), "utf-8")
b'{"\u00c4": 0}'

json.dumps() returns a Unicode string, and bytes() returns its byte representation – the string encoded as UTF-8.

How do I achieve the same result in Lua? I need a byte representation of a JSON object that contains non-ASCII characters.

Answer

You have to do it manually.

-- Decode the UTF-8 sequence that starts at byte index `pos` of `utf8str`.
--
-- Returns two values: the decoded code point and the number of bytes used.
-- The decoder is deliberately lenient:
--   * lead bytes 0xC0..0xFD are accepted, so the legacy 5- and 6-byte forms
--     are decoded as well;
--   * any other byte (ASCII, a stray continuation byte, 0xFE, 0xFF) is
--     returned as-is with a size of 1;
--   * if a continuation byte is missing or malformed, the lead byte itself
--     is returned with a size of 1, so a caller always advances.
-- `pos` must address an existing byte of `utf8str`.
local function utf8_to_unicode(utf8str, pos)
   local lead = utf8str:byte(pos)
   if not (lead >= 0xC0 and lead < 0xFE) then
      return lead, 1
   end

   -- The lead byte alone fixes how many continuation bytes follow and how
   -- many payload bits it contributes itself.
   local extra, code
   if lead < 0xE0 then
      extra, code = 1, lead - 0xC0
   elseif lead < 0xF0 then
      extra, code = 2, lead - 0xE0
   elseif lead < 0xF8 then
      extra, code = 3, lead - 0xF0
   elseif lead < 0xFC then
      extra, code = 4, lead - 0xF8
   else
      extra, code = 5, lead - 0xFC
   end

   for offset = 1, extra do
      -- Reading past the end of the string yields nil; treat it as byte 0,
      -- which fails the continuation-byte check below.
      local cont = utf8str:byte(pos + offset) or 0
      if cont < 0x80 or cont >= 0xC0 then
         return lead, 1
      end
      -- Each continuation byte carries six payload bits.
      code = code * 64 + (cont - 0x80)
   end

   return code, extra + 1
end

-- Escape every non-ASCII character of a UTF-8 string the way Python's
-- json.dumps does by default, so the result is pure ASCII.
--
--   * code points below 0x80 are copied through unchanged;
--   * code points up to 0xFFFF become \uXXXX (lowercase hex);
--   * code points 0x10000..0x10FFFF become a UTF-16 surrogate pair
--     \uXXXX\uXXXX, because JSON has no \U escape;
--   * anything larger (only produced when the decoder is fed a malformed
--     sequence) keeps the \UXXXXXXXX form.
--
-- Decoding is delegated to utf8_to_unicode, defined earlier in this file.
-- Returns the escaped string; an empty input yields an empty string.
function utf8_to_python(utf8str)
   -- Collect pieces and join once: repeated `..` in a loop is quadratic.
   local pieces, count = {}, 0
   local pos = 1
   while pos <= #utf8str do
      local unicode, size = utf8_to_unicode(utf8str, pos)
      pos = pos + size

      local piece
      if unicode < 0x80 then
         piece = string.char(unicode)
      elseif unicode < 0x10000 then
         piece = string.format('\\u%04x', unicode)
      elseif unicode <= 0x10FFFF then
         -- Split the 20 bits above the BMP into high and low surrogates.
         -- math.floor and % keep this valid on Lua 5.1, which has no //.
         local offset = unicode - 0x10000
         piece = string.format('\\u%04x\\u%04x',
            0xD800 + math.floor(offset / 0x400),
            0xDC00 + offset % 0x400)
      else
         piece = string.format('\\U%08x', unicode)
      end

      count = count + 1
      pieces[count] = piece
   end
   return table.concat(pieces)
end

Usage:

-- Encode a table with a non-ASCII key, then escape the resulting JSON text.
local json = require('json')
local encoded = json.encode({['Ä'] = 0})
print(encoded)                          -->  {"Ä":0}
local escaped = utf8_to_python(encoded)
print(escaped)                          -->  {"\u00c4":0}

Leave a Reply

Your email address will not be published. Required fields are marked *