##language:zh #pragma section-numbers on ''' 含有章节索引的 *PUG 文章通用模板 ''' ::-- ehu4ever [<>] <> ## 默许导航,请保留 <> = 关于Python字符编码的一些经验 = ''这几天为unicode、cp936、gb18030这些个东东搞得身心憔悴,也总算是有了些经验。今天小结一下。'' == 处理windows中的unicode文件名 == * 这里所谓的unicode文件名是指文件名中包含了一些不在中文windows默认字符集gb18030之内的字符,比如{{{©}}},它在utf-8编码下占用了16个bit。 * 事情的起因是我想提取一个unicode文件名,在pygtk+的GUI上显示。因为pygtk的GUI上只能显示unicode,我最初的解决方法是在model层存储gb18030字符集的文件名,在view层显示的时候转换为unicode。可是这个copyright符号总是显示有问题。 * 一番研究之后发现,model层里的{{{©}}}已经变成了{{{\xa9}}},少了前一半,看来是取文件名的时候出错。 * 对于unicode、cp936、gb18030这些理论我不是很懂,但是我知道我的目标是取文件名的时候完整地取出{{{\xc2\xa9}}},也就是完整地取出一个unicode字符,这样在GUI上显示的时候就不会有问题了,而且也不需要别处的转换。 * 下面是两个示例程序: === 有关情况说明 === {{{ >>> import os,sys >>> sys.stdout.encoding 'cp936' >>> sys.stdin.encoding 'cp936' >>> dir = 'e:\\PythonSpace\\copyright' >>> os.listdir(dir) ['John Wiley & Sons ? 2004 - John Wiley And Sons Professional Jakarta Struts.txt', 'winunichanges', 'winunichanges.zip'] >>> >>> dir = u'e:\\PythonSpace\\copyright' >>> print os.listdir(dir) [u'John Wiley & Sons \xa9 2004 - John Wiley And Sons Professional Jakarta Struts.txt', u'winunichan ges', u'winunichanges.zip'] >>> os.listdir(dir) [u'John Wiley & Sons \xa9 2004 - John Wiley And Sons Professional Jakarta Struts.txt', u'winunichan ges', u'winunichanges.zip'] >>> print [f.encode('utf8') for f in os.listdir(dir)] ['John Wiley & Sons \xc2\xa9 2004 - John Wiley And Sons Professional Jakarta Struts.txt', 'winunich anges', 'winunichanges.zip'] >>> print '\n'.join([f.encode('utf8') for f in os.listdir(dir)]) John Wiley & Sons 漏 2004 - John Wiley And Sons Professional Jakarta Struts.txt winunichanges winunichanges.zip >>> }}} === 示例1 === {{{ #!python import pygtk pygtk.require('2.0') import gtk, gobject tasks = { "Buy groceries": "Go to Asda after work", "Do some programming": "Remember to update your software", "Power up systems": "Turn on the client but leave the server", "Watch some tv": "Remember to catch ER", "copyright": "\xc2\xa9" } class GUI_Controller: def __init__(self): self.root 
= gtk.Window() self.root.set_title('CellRenderder Example') self.root.connect('destroy', lambda w:gtk.main_quit()) self.mdl = Store.get_model() self.view = Display.make_view(self.mdl) self.root.add(self.view) self.root.show_all() return def run(self): gtk.main() return class InfoModel: def __init__(self): self.tree_store = gtk.TreeStore(gobject.TYPE_STRING, gobject.TYPE_BOOLEAN) for item in tasks.keys(): parent = self.tree_store.append(None, (item, None)) self.tree_store.append(parent, (tasks[item], None)) return def get_model(self): if self.tree_store: return self.tree_store else: return None class DisplayModel: def make_view(self, model): self.view = gtk.TreeView(model) self.renderer = gtk.CellRendererText() self.renderer.set_property('editable', True) self.renderer.connect('edited', self.col0_edited_cb, model) self.renderer1 = gtk.CellRendererToggle() self.renderer1.set_property('activatable', True) self.renderer1.connect('toggled', self.col1_toggled_cb, model) self.col0 = gtk.TreeViewColumn('Name', self.renderer, text=0) self.col1 = gtk.TreeViewColumn('Complete', self.renderer1) self.col1.add_attribute(self.renderer1, 'active', 1) self.view.append_column(self.col0) self.view.append_column(self.col1) return self.view def col0_edited_cb(self, cell, path, new_text, model): print "change '%s' to '%s'" % (model[path][0], new_text.encode('cp936')) model[path][0] = new_text return def col1_toggled_cb(self, cell, path, model): model[path][1] = not model[path][1] print "toggle '%s' to: '%s'" % (model[path][0], model[path][1]) return if __name__ == '__main__': Store = InfoModel() Display = DisplayModel() myGUI = GUI_Controller() myGUI.run() }}} === 示例2 === {{{ #!python import os, stat, time, sys import pygtk pygtk.require('2.0') import gtk folderxpm = [ "17 16 7 1", " c #000000", ". c #808000", "X c yellow", "o c #808080", "O c #c0c0c0", "+ c white", "@ c None", "@@@@@@@@@@@@@@@@@", "@@@@@@@@@@@@@@@@@", "@@+XXXX.@@@@@@@@@", "@+OOOOOO.@@@@@@@@", "@+OXOXOXOXOXOXO. 
", "@+XOXOXOXOXOXOX. ", "@+OXOXOXOXOXOXO. ", "@+XOXOXOXOXOXOX. ", "@+OXOXOXOXOXOXO. ", "@+XOXOXOXOXOXOX. ", "@+OXOXOXOXOXOXO. ", "@+XOXOXOXOXOXOX. ", "@+OOOOOOOOOOOOO. ", "@ ", "@@@@@@@@@@@@@@@@@", "@@@@@@@@@@@@@@@@@" ] folderpb = gtk.gdk.pixbuf_new_from_xpm_data(folderxpm) filexpm = [ "12 12 3 1", " c #000000", ". c #ffff04", "X c #b2c0dc", "X XXX", "X ...... XXX", "X ...... X", "X . ... X", "X ........ X", "X . .... X", "X ........ X", "X . .. X", "X ........ X", "X . .. X", "X ........ X", "X X" ] filepb = gtk.gdk.pixbuf_new_from_xpm_data(filexpm) class FileListingCellDataExample: column_names = ['Name', 'Size', 'Mode', 'Last Changed'] def __init__(self, dname=None): cell_data_funcs = (None, self.file_size, self.file_mode, self.file_last_changed) window = gtk.Window() window.set_size_request(400, 300) window.connect('destroy', lambda w: gtk.main_quit()) self.window = window listmodel = self.make_list(dname) self.treeview = gtk.TreeView() self.tvclm = [None]*len(self.column_names) cellpb = gtk.CellRendererPixbuf() self.tvclm[0] = gtk.TreeViewColumn(self.column_names[0], cellpb) self.tvclm[0].set_cell_data_func(cellpb, self.file_pixbuf) cell = gtk.CellRendererText() self.tvclm[0].pack_start(cell, False) self.tvclm[0].set_cell_data_func(cell, self.file_name) self.treeview.append_column(self.tvclm[0]) for n in range(1, len(self.column_names)): cell = gtk.CellRendererText() self.tvclm[n] = gtk.TreeViewColumn(self.column_names[n], cell) if n == 1: cell.set_property('xalign', 1.0) self.tvclm[n].set_cell_data_func(cell, cell_data_funcs[n]) self.treeview.append_column(self.tvclm[n]) self.treeview.connect('row-activated', self.open_file) self.scrolledwindow = gtk.ScrolledWindow() self.scrolledwindow.add(self.treeview) self.window.add(self.scrolledwindow) self.treeview.set_model(listmodel) self.window.show_all() return def make_list(self, dname=None): if not dname: # self.dirname = os.path.expanduser('~') self.dirname = u'e:\\' else: self.dirname = os.path.abspath(dname) 
self.window.set_title(self.dirname) files = [f for f in os.listdir(self.dirname) if f[0] <> '.'] files.sort() files= ['..'] + files listmodel = gtk.ListStore(object) for f in files: listmodel.append([f]) return listmodel def open_file(self, treeview, path, column): model = treeview.get_model() iter = model.get_iter(path) filename = os.path.join(self.dirname, model.get_value(iter, 0)) filestat = os.stat(filename) if stat.S_ISDIR(filestat.st_mode): new_model = self.make_list(filename) treeview.set_model(new_model) return def file_pixbuf(self, column, cell, model, iter): filename = os.path.join(self.dirname, model.get_value(iter, 0)) filestat = os.stat(filename) if stat.S_ISDIR(filestat.st_mode): pb = folderpb else: pb = filepb cell.set_property('pixbuf', pb) return def file_name(self,column, cell, model, iter): # cell.set_property('text', unicode(model.get_value(iter, 0), 'gb18030')) cell.set_property('text', model.get_value(iter, 0)) return def file_size(self, column, cell, model, iter): filename = os.path.join(self.dirname, model.get_value(iter, 0)) filestat = os.stat(filename) cell.set_property('text', filestat.st_size) return def file_mode(self, column, cell, model, iter): filename = os.path.join(self.dirname, model.get_value(iter, 0)) filestat = os.stat(filename) cell.set_property('text', oct(stat.S_IMODE(filestat.st_mode))) return def file_last_changed(self, column, cell, model, iter): filename = os.path.join(self.dirname, model.get_value(iter, 0)) filestat = os.stat(filename) cell.set_property('text', time.ctime(filestat.st_mtime)) return if __name__ == '__main__': FileListingCellDataExample() gtk.main() }}} == 希望有人用理论解释这些 == 我这里只是提出了一种方法,希望有朋友用理论来解释这一切。这样就可以知其然,也知其所以然了。 * 我个人理解版权符号并不是gb码中的一个字符,但它是unicode的一个符号。也就是说unicode是一个全集,有时我们看到的以为是汉字符号,但其实是不存在对应的汉字的,因此这种情况下转为汉字编码就不会成功。因此最好的方法就是使用utf-8。至于汉字编码表中是否有可以查的区间(不在这个区间内的自然就不是汉字符号),我没有查过,但猜想是这个问题。 -- Limodou