2 Star 1 Fork 0

江西逐浪软件科技有限公司 / Z01FontElf

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
count_char.py 8.01 KB
一键复制 编辑 原始数据 按行查看 历史
cms发哥 提交于 2021-12-03 17:50 . 提交
from tkinter import *
from tkinter import ttk
from itertools import tee
import global_var
# Special CJK compatibility code points (base-10 integers) used for the GBK
# and "compat-ideo" counts. Both lists start empty here and are filled at the
# bottom of this module, after deci() has been defined.
# (A `global` statement has no effect at module scope, so none is used.)
gbk_compatibility_deci_list = []
cjk_compatibility_ideographs_deci_list = []
def disable_event():
    """Do nothing.

    Installed as the popup's WM_DELETE_WINDOW handler so that clicking the
    window's close button has no effect while counting is in progress.
    """
    return None
def count_char(char_list, master, lang="en"):
    """Count the CJK characters in a font's character list.

    While counting, a modal progress popup is shown on top of ``master``.

    Args:
        char_list: iterable of rows. Only ``row[0]`` is read; it must be the
            code point as a base-10 integer. (Per the original author's note,
            ``row[1]`` is a glyph id and ``row[2]`` the Unicode name.)
            Duplicate code points are counted once.
        master: Tk widget used as the parent of the progress popup.
        lang: popup title language: ``"zhs"`` (Simplified Chinese),
            ``"zht"`` (Traditional Chinese); anything else gives English.

    Returns:
        A tuple ``(cjk_char_count, unicode_char_count)``.
        ``cjk_char_count`` is the module-level dict (reset and refilled on
        every call) mapping each name in ``global_var.cjk_list`` to a count.
        ``unicode_char_count`` is a new dict mapping each name in
        ``global_var.unicode_list`` to a count, plus a ``"total"`` entry.
    """
    # Language localization of the popup title.
    if lang == "zhs":  # simplified chinese
        toplevel_title = "精灵正在运行-请稍侯|计数中……"
    elif lang == "zht":  # traditional chinese
        toplevel_title = "計數中……"
    else:
        toplevel_title = "Counting..."

    # Prepare the per-Unicode-block counters.
    unicode_char_count = {item: 0 for item in global_var.unicode_list}
    # Reset the per-encoding counters (shared module-level dict).
    for encoding in global_var.cjk_list:
        cjk_char_count[encoding] = 0

    # Materialize the input once: it is needed both for the total (progress
    # bar maximum) and for the counting pass, and may be a one-shot iterator.
    rows = list(char_list)
    total_record_count = len(rows)

    # Loop-invariant lookup structures, built once instead of per character.
    gbk_basic_range = char_range(deci("4E00"), deci("9FA5"))
    gbk_compat_points = set(gbk_compatibility_deci_list)
    compat_ideograph_points = set(cjk_compatibility_ideographs_deci_list)
    # gb18030 and gbk have no sample file; every other encoding is checked
    # against its character list. Sets make each membership test O(1).
    encoding_chars = {
        encoding: set(cjk_dict[encoding])
        for encoding in global_var.cjk_list
        if encoding not in ("gb18030", "gbk")
    }

    # Popup with progress bar.
    popup = Toplevel(master)
    popup.title(toplevel_title)
    popup.minsize(500, 10)
    popup.resizable(False, False)
    # Grab all input so the main window cannot be used while counting.
    popup.grab_set()
    # Ignore the close button (overrideredirect would hide the title bar).
    popup.protocol("WM_DELETE_WINDOW", disable_event)

    progress = 0
    progress_var = DoubleVar()
    progress_bar = ttk.Progressbar(
        popup,
        orient="horizontal",
        length=500,
        variable=progress_var,
        mode='determinate',
        # Avoid a zero maximum when the input is empty.
        maximum=max(total_record_count, 1),
    )
    # Only pack() is used: the original also called grid() first, which
    # pack() immediately overrode.
    progress_bar.pack()

    # The same code point can appear several times because every `cmap`
    # table of every platform is exported; count each one only once.
    seen_code_points = set()
    try:
        for row in rows:
            code_point = row[0]
            # Keep the popup responsive / redraw the progress bar.
            popup.update()
            if code_point not in seen_code_points:
                seen_code_points.add(code_point)
                # Name of the Unicode block, or None outside the CJK blocks.
                block_name = uni_range_check(code_point)
                if block_name:
                    unicode_char_count[block_name] += 1
                    # Compatibility-block characters that are listed in
                    # cjk_compatibility_ideographs_deci_list are also counted
                    # under "compat-ideo".
                    if (block_name == "compat"
                            and code_point in compat_ideograph_points):
                        unicode_char_count["compat-ideo"] += 1
                    # Encodings are only counted for characters inside the
                    # CJK blocks; compare using the real character.
                    char = chr(code_point)
                    for encoding in global_var.cjk_list:
                        # gb18030 is derived from the block totals below.
                        if encoding == "gb18030":
                            continue
                        # gbk has no sample file: use U+4E00..U+9FA5 plus
                        # the GBK compatibility code points.
                        if encoding == "gbk":
                            if (code_point in gbk_basic_range
                                    or code_point in gbk_compat_points):
                                cjk_char_count[encoding] += 1
                            continue
                        if char in encoding_chars[encoding]:
                            cjk_char_count[encoding] += 1
            # Progress advances for every row, duplicates included, so that
            # it reaches total_record_count at the end.
            progress += 1
            progress_var.set(progress)
    finally:
        # Always remove the popup: it holds the grab and cannot be closed by
        # the user, so leaving it behind after an error would lock the UI.
        popup.destroy()

    # gb18030: CJK Unified Ideographs + Extension A + ideographic zero.
    cjk_char_count["gb18030"] = (
        unicode_char_count["basic"]
        + unicode_char_count["ext-a"]
        + unicode_char_count["zero"]
    )
    # Total over the ideograph blocks.
    unicode_char_count["total"] = (
        unicode_char_count["zero"]
        + unicode_char_count["basic"]
        + unicode_char_count["ext-a"]
        + unicode_char_count["compat-ideo"]
        + unicode_char_count["ext-b"]
        + unicode_char_count["ext-c"]
        + unicode_char_count["ext-d"]
        + unicode_char_count["ext-e"]
        + unicode_char_count["ext-f"]
        + unicode_char_count["ext-g"]
    )
    return (cjk_char_count, unicode_char_count)
def load_sample_file(filename):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Each line has leading/trailing CR and LF characters removed first, then
    leading/trailing spaces. Empty lines are kept as empty strings.

    Args:
        filename: path of the file to read.

    Returns:
        list of str, one entry per line, in file order.
    """
    # `with` guarantees the file handle is closed, even if reading fails.
    with open(filename, "r", encoding="utf-8") as sample_file:
        return [line.strip("\r\n").strip(" ") for line in sample_file]
def deci(number):
    """Convert a hexadecimal string to a base-10 integer.

    Args:
        number: hexadecimal digits as a string, e.g. ``"4E00"``.

    Returns:
        The integer value, or 0 if ``number`` cannot be parsed as
        hexadecimal (invalid digits, empty string, or not a string).
    """
    try:
        return int(number, 16)
    # Only conversion failures map to 0; a bare `except` would also swallow
    # KeyboardInterrupt and SystemExit.
    except (ValueError, TypeError):
        return 0
def char_range(start, end):
    """Return a range from ``start`` to ``end`` with ``end`` included.

    Python's built-in ``range`` excludes its stop value; character ranges
    are written with an inclusive last code point, hence this helper.

    normal range:           range(0, 5)      --> [0, 1, 2, 3, 4],    len 5
    character detect range: char_range(0, 5) --> [0, 1, 2, 3, 4, 5], len 6
    """
    return range(start, end + 1)
# Unicode blocks recognised by uni_range_check(): (first, last, name), with
# `last` inclusive. Converted once into range objects so that no hex parsing
# or range construction happens per call.
_UNI_BLOCKS = (
    (0x4E00, 0x9FFF, "basic"),        # CJK Unified Ideographs
    (0x2F00, 0x2FDF, "kangxi"),       # Kangxi Radicals
    (0x2E80, 0x2EFF, "kangxi-sup"),   # CJK Radicals Supplement
    (0x3007, 0x3007, "zero"),         # U+3007 Ideographic Number Zero
    (0x3400, 0x4DBF, "ext-a"),        # CJK Unified Ideographs Extension A
    (0xF900, 0xFAFF, "compat"),       # CJK Compatibility Ideographs
    (0x20000, 0x2A6DF, "ext-b"),      # CJK Unified Ideographs Extension B
    (0x2A700, 0x2B73F, "ext-c"),      # CJK Unified Ideographs Extension C
    (0x2B740, 0x2B81F, "ext-d"),      # CJK Unified Ideographs Extension D
    (0x2B820, 0x2CEAF, "ext-e"),      # CJK Unified Ideographs Extension E
    (0x2CEB0, 0x2EBEF, "ext-f"),      # CJK Unified Ideographs Extension F
    (0x2F800, 0x2FA1F, "compat-sup"), # CJK Compatibility Ideographs Supplement
    (0x30000, 0x3134F, "ext-g"),      # CJK Unified Ideographs Extension G
)
_UNI_BLOCK_RANGES = tuple(
    (range(first, last + 1), name) for first, last, name in _UNI_BLOCKS
)


def uni_range_check(char_base10):
    """Return the name of the CJK-related Unicode block of a code point.

    Args:
        char_base10: code point as a base-10 integer.

    Returns:
        One of "basic", "kangxi", "kangxi-sup", "zero", "ext-a", "compat",
        "ext-b", "ext-c", "ext-d", "ext-e", "ext-f", "compat-sup", "ext-g",
        or None when the code point is in none of these blocks.
    """
    # The blocks do not overlap, so the first match is the only match.
    for block_range, block_name in _UNI_BLOCK_RANGES:
        if char_base10 in block_range:
            return block_name
    return None
# Fill gbk_compatibility_deci_list and cjk_compatibility_ideographs_deci_list.
# This must run after deci() is defined, which is why it sits at the bottom.
cjk_compatibility_list_hex=["FA0E", "FA0F", "FA11", "FA13", "FA14", "FA1F", "FA21", "FA23", "FA24", "FA27", "FA28", "FA29"] #﨎﨏﨑﨓﨔﨟﨡﨣﨤﨧﨨﨩
gbk_compatibility_deci_list_hex=["F92C", "F979", "F995", "F9E7", "F9F1", "FA0C", "FA0D", "FA18", "FA20"] #郎凉秊裏隣兀嗀礼蘒
# The first group of code points belongs to both lists ...
for item in cjk_compatibility_list_hex:
    gbk_compatibility_deci_list.append(deci(item))
    cjk_compatibility_ideographs_deci_list.append(deci(item))
# ... the second group only to the GBK list.
for item in gbk_compatibility_deci_list_hex:
    gbk_compatibility_deci_list.append(deci(item))

# Load the character list of every CJK encoding from "<encoding>-han.txt".
# This runs once, when the module is imported. The file name is relative,
# so it is resolved against the current working directory.
cjk_dict = {}
cjk_char_count = {}
for encoding in global_var.cjk_list:
    # gb18030 and gbk have no character list file (the gbk list is obsolete).
    if encoding == "gb18030" or encoding == "gbk":
        continue
    cjk_dict[encoding] = load_sample_file(encoding+"-han.txt")
    cjk_char_count[encoding]=0
Python
1
https://gitee.com/zoomla/Z01FontElf.git
git@gitee.com:zoomla/Z01FontElf.git
zoomla
Z01FontElf
Z01FontElf
main

搜索帮助