Differences between revisions 1 and 2
Revision 1 as of 2010-11-06 08:08:11
Size: 3189
Editor: ZoomQuiet
Comment:
Revision 2 as of 2010-11-06 08:10:37
Size: 4200
Editor: ZoomQuiet
Comment:
Deletions are marked like this. Additions are marked like this.
Line 35: Line 35:
3266- fatal: {error,{wfc_Legal_Character,{error,{bad_character,37070}}}}
** exception exit: {fatal,{{error,{wfc_Legal_Character,{error,{bad_character,37070}}}},
3266- fatal: {error,{wfc_Legal_Character,{error,{bad_character,37070}} }}
** exception exit: {fatal,{{error,{wfc_Legal_Character,{error,{bad_character,37070}} }},
Line 64: Line 64:
,{wfc_Legal_Character,{error,{bad_character,37070}}}} ,{wfc_Legal_Character,{error,{bad_character,37070}} }}
Line 68: Line 68:
                                {error,{bad_character,37070}}}},                                 {error,{bad_character,37070}} }},
Line 95: Line 95:
=== unicode ===
昨天用 xmerl_scan:string/1 解析 XML,发现 xmerl_scan:string/1 返回的是 Unicode 码点(code point)列表,如中国的“中”,经过 xmerl_scan:string/1 处理后,返回的是 [20013](即 U+4E2D)。继续传递到 ejabberd_odbc.erl 时就出错了,因为它没有对码点做编码处理。

解决此问题有两种方法
 * 1.修改 程序代码 对xmerl_scan:string/1返回值进行编码
 * 2.修改ejabberd_odbc.erl对传入的参数进行编码


{{{
#!diff
Index: odbc/ejabberd_odbc.erl
===================================================================
--- odbc/ejabberd_odbc.erl (版本 1)
+++ odbc/ejabberd_odbc.erl (版本 2)
@@ -150,7 +150,11 @@
 
 %% Escape character that will confuse an SQL engine
 escape(S) when is_list(S) ->
- [odbc_queries:escape(C) || C <- S].
+ %% to handle unicode codepoint to utf-8 string
+ UTF8Binary = unicode:characters_to_binary(S, utf8, utf8),
+ UTF8Str = binary_to_list(UTF8Binary),
+ [odbc_queries:escape(C) || C <- UTF8Str].
}}}

erl:xmerl_scan:string/2 对汉字的处理

问题

郎咸武 <[email protected]>
回复      [email protected]
发送至     "[email protected]" <[email protected]>,
erlang-china <[email protected]>
日期      2010年10月27日 下午4:20
主题      [erlang-china:3488] 关于xmerl_scan:string/2 方法 对汉字的处理

请问谁用过 xmerl_scan:string/2 方法,怎么才能处理汉字?如下面的例子:

6> xmerl_scan:string("<name>langzhe</name>",[{encoding, 'utf-8'}]).
{{xmlElement,name,name,[],
             {xmlNamespace,[],[]},
             [],1,[],
             [{xmlText,[{name,1}],1,[],"langzhe",text}],
             [],"/home/jason",undeclared},
 []}
7> xmerl_scan:string("<name>郎哲</name>",[{encoding, 'utf-8'}]).
3266- fatal: {error,{wfc_Legal_Character,{error,{bad_character,37070}} }}
** exception exit: {fatal,{{error,{wfc_Legal_Character,{error,{bad_character,37070}} }},
                           {file,file_name_unknown},
                           {line,1},
                           {col,9}}}
     in function  xmerl_scan:fatal/2
     in call from xmerl_scan:scan_char_data/5
     in call from xmerl_scan:scan_content/11
     in call from xmerl_scan:scan_element/12
     in call from xmerl_scan:scan_document/2
     in call from xmerl_scan:string/2

尝试 UTF-8

环境
  • ubuntu10.04
  • Erlang R13B03 (erts-5.7.4) [source] [smp:2:2] [rq:2] [async-threads:0] [kernel-poll:false]

Eshell V5.7.4  (abort with ^G)
1> testxmerl:test().
{{xmlElement,name,name,[],
             {xmlNamespace,[],[]},
             [],1,[],
             [{xmlText,[{name,1}],1,[],[37070,21746],text}],
------- 这个地方转换成 list 不是我想要的结果 ,我想要的结果还应该是“郎哲”
             [],"/home/jason/learn",undeclared},
 []}
2> testxmerl:test("郎哲").
3266- fatal: {error
,{wfc_Legal_Character,{error,{bad_character,37070}} }}
** exception exit: {fatal,
                       {{error,
                            {wfc_Legal_Character,
                                {error,{bad_character,37070}} }},
                        {file,file_name_unknown},
                        {line,1},
                        {col,9}}}
     in function  xmerl_scan:fatal/2
     in call from xmerl_scan:scan_char_data/5
     in call from xmerl_scan:scan_content/11
     in call from xmerl_scan:scan_element/12
     in call from xmerl_scan:scan_document/2
     in call from xmerl_scan:string/2

----源测试代码---------------

#erlang
-module(testxmerl).
-export([test/0,test/1]).

%% Scan a one-element XML document whose text is the literal compiled into
%% this module, with the scanner option {encoding, 'utf-8'}.
%%
%% Returns whatever xmerl_scan:string/2 returns. In the session recorded on
%% this page the element text came back as the integer list [37070,21746]
%% rather than as a printable string.
%% NOTE(review): the bytes this literal compiles to presumably depend on the
%% source-file encoding the compiler assumes -- confirm for your OTP release.
test() ->
    Document = lists:append(["<name>", "郎哲", "</name>"]),
    ScanOptions = [{encoding, 'utf-8'}],
    xmerl_scan:string(Document, ScanOptions).

%% Wrap Str in a <name> element and scan it with {encoding, 'utf-8'}.
%%
%% Str must be a list, because it is concatenated between the opening and
%% closing tags. Returns whatever xmerl_scan:string/2 returns.
%%
%% In the session recorded on this page, calling this from the shell with
%% "郎哲" exited with
%% {fatal, {{error, {wfc_Legal_Character, {error, {bad_character, 37070}}}}, ...}}.
%% NOTE(review): presumably the 'utf-8' option makes the scanner expect UTF-8
%% bytes, so a list of code points above 255 is rejected -- confirm against
%% the xmerl_scan documentation.
test(Str) ->
    ScanOptions = [{encoding, 'utf-8'}],
    Document = lists:append(["<name>", Str, "</name>"]),
    xmerl_scan:string(Document, ScanOptions).

unicode

昨天用 xmerl_scan:string/1 解析 XML,发现 xmerl_scan:string/1 返回的是 Unicode 码点(code point)列表,如中国的“中”,经过 xmerl_scan:string/1 处理后,返回的是 [20013](即 U+4E2D)。继续传递到 ejabberd_odbc.erl 时就出错了,因为它没有对码点做编码处理。

解决此问题有两种方法

  • 1.修改 程序代码 对xmerl_scan:string/1返回值进行编码
  • 2.修改ejabberd_odbc.erl对传入的参数进行编码

   1 Index: odbc/ejabberd_odbc.erl
   2 ===================================================================
   3 --- odbc/ejabberd_odbc.erl (版本 1)
   4 +++ odbc/ejabberd_odbc.erl (版本 2)
   5 @@ -150,7 +150,11 @@
   6  
   7  %% Escape character that will confuse an SQL engine
   8  escape(S) when is_list(S) ->
   9 -    [odbc_queries:escape(C) || C <- S].
  10 +    %% to handle unicode codepoint to utf-8 string
  11 +    UTF8Binary = unicode:characters_to_binary(S, utf8, utf8),
  12 +    UTF8Str = binary_to_list(UTF8Binary),
  13 +    [odbc_queries:escape(C) || C <- UTF8Str].
  14 


反馈

创建 by -- ZoomQuiet [2010-11-06 08:08:10]

MiscItems/2010-11-06 (last edited 2010-11-06 08:15:22 by ZoomQuiet)