Skip to content

Commit 7f76d51

Browse files
committed Dec 14, 2017
extract encoding data from object files
1 parent 62fc63f commit 7f76d51

File tree

1 file changed

+113
-0
lines changed

1 file changed

+113
-0
lines changed
 

‎scripts/generate.rb

+113
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
# coding: utf-8

# Root of the Ruby source tree to read from: the first command-line
# argument, or a default checkout location when none is given.
REPO_PATH = (ARGV.first || '/usr/src/ruby-2.4.2').freeze # path to ruby repo
# Object-file section name (without the leading dot) that process_binary
# looks up in the objdump section headers.
SECTION_NAME = "rdata".freeze
# Unicode version directory under enc/unicode/ that holds name2ctype.h.
UNICODE_VERSION = "9.0.0".freeze
# Directory the generated Java source is written under.
SRC_DIR = "../src/org/jcodings".freeze
# Directory the generated binary table files are written to.
DST_BIN_DIR = "../resources/tables".freeze
# One indentation level (four spaces) for the generated Java source.
INDENT = (" " * 4).freeze
# Entry point of the generator: runs each generation step in order.
#
# The case-fold step is currently disabled (its call is commented out).
def generate_data
  generate_transoder_data
  generate_coderange_data
  genrate_coderange_list
  # genrate_fold_data
end
# Reads an object file and yields every defined symbol reported by `nm`.
#
# The section headers are obtained from `objdump -h -j .<SECTION_NAME>`; the
# hexadecimal token that precedes the last token on the section's line is
# used as a base offset (presumably the section's file offset -- confirm
# against the objdump output format in use).
#
# @param obj_name [String] path to the object file
# @yieldparam name [String, nil] symbol name (third whitespace-separated nm field)
# @yieldparam binary [String] the whole object file, read in binary mode
# @yieldparam address [Integer] the symbol's nm address plus the base offset
# @raise [Errno::ENOENT] if the object file does not exist
# @raise [RuntimeError] if the section is not present in the objdump output
def process_binary(obj_name)
  # File.open never treats a leading "|" as a command, unlike Kernel#open.
  binary = File.open(obj_name, "rb", &:read)

  # Argument-vector form: no shell is involved, so paths containing spaces
  # or shell metacharacters are passed through verbatim.
  headers = IO.popen(["objdump", "-h", "-j", ".#{SECTION_NAME}", obj_name], &:read)
  offset_hex = headers[/\.#{SECTION_NAME}.*?(\w+)\s+\S+$/, 1]
  raise "section .#{SECTION_NAME} not found in #{obj_name}" unless offset_hex
  offset = offset_hex.to_i(16)

  symbols = IO.popen(["nm", "--no-sort", "--defined-only", obj_name], &:read)
  symbols.split("\n").each do |line|
    address, _type, name = line.split(/\s+/)
    yield name, binary, address.to_i(16) + offset
  end
end
24+
25+
def generate_transoder_data
26+
Dir["#{REPO_PATH}/enc/trans/*.c"].reject{|f| f =~ /transdb/}.each do |trans_file|
27+
# next unless trans_file =~ /utf8/
28+
trans_file = trans_file[/(.*)\./, 1]
29+
src = open("#{trans_file}.c", "rb").read
30+
process_binary "#{trans_file}.o"do |name, binary, address|
31+
case name
32+
when /(.*)_byte_array/
33+
name = $1
34+
size = src[/(\w+?_byte_array)\[(\d+?)\]/m, 2].to_i
35+
open("#{DST_BIN_DIR}/" + "Transcoder_#{name.capitalize.tr('_', '')}_ByteArray.bin", "wb") do |f|
36+
f << [size].pack("N")
37+
f << binary[address, size]
38+
end
39+
when /(.*)_word_array/
40+
name = $1
41+
size = src[/(\w+?_word_array)\[(\d+?)\]/m, 2].to_i
42+
open("#{DST_BIN_DIR}/" + "Transcoder_#{name.capitalize.tr('_', '')}_WordArray.bin", "wb") do |f|
43+
f << [size].pack("N")
44+
address.step(address + (size * 4 - 1), 4).each do |adr|
45+
f << binary[adr, 4].unpack("l").pack("N")
46+
end
47+
end
48+
end
49+
end
50+
end
51+
end
# Dumps every symbol of enc/unicode.o whose name matches /CR_(.*)/ into
# "<DST_BIN_DIR>/<symbol>.bin".
#
# The first native-endian 32-bit word at the symbol's address is read as a
# count; it is followed by count * 2 further 32-bit words. The output file
# holds, all as 32-bit big-endian values: the total number of words that
# follow (count * 2 + 1), the count itself, and then the count * 2 data words.
#
# @raise [RuntimeError] if the object file holds fewer data words than the
#   count announces
def generate_coderange_data
  process_binary "#{REPO_PATH}/enc/unicode.o" do |name, binary, address|
    case name
    when /CR_(.*)/
      count = binary[address, 4].unpack("l").first
      # A non-positive count has no payload; skip the slice entirely then.
      words = count > 0 ? binary[address + 4, count * 2 * 4].unpack("l*") : []
      raise "#{name}: expected #{count * 2} words, found #{words.size}" if count > 0 && words.size != count * 2

      File.open("#{DST_BIN_DIR}/#{name}.bin", "wb") do |f|
        f << ([count * 2 + 1, count] + words).pack("N*")
      end
    end
  end
end
# Generates "<SRC_DIR>/unicode/YUnicodeProperties.java" from
# UnicodePropertiesTemplate.java (read from the current directory) by listing
# every code range table found in name2ctype.h together with its names.
#
# Steps:
# 1. Collect "#define CR_<alias> CR_<target>" lines from the raw header and
#    map each target to the alias, normalized (underscores removed, lowercase).
# 2. Run the header through `cpp` with the Unicode property macros defined
#    and keep only non-empty lines that do not start with "#" or ";".
# 3. From the preprocessed text, read the string pool (pool id => name), the
#    wordlist (pool id => number) and the CodeRanges table list.
# 4. Names sharing the same wordlist number are treated as aliases of each other.
# 5. Emit one CodeRangeEntry per (name, table) pair into the "%{extcrs}"
#    placeholder; "%{stdcrs}" is replaced with "null".
def genrate_coderange_list
  name2ctype_h = "#{REPO_PATH}/enc/unicode/#{UNICODE_VERSION}/name2ctype.h"

  header = File.open(name2ctype_h, "rb", &:read)
  cr_map = header.scan(/#define CR_(.*?) CR_(.*)/).each_with_object({}) do |(alias_name, target), h|
    h[target] = alias_name.tr('_', '').downcase
  end

  # Argument-vector form avoids the shell, so the path needs no quoting; the
  # line filter reproduces `grep "^[^#;]"` (which also drops empty lines).
  cpp_cmd = ["cpp", name2ctype_h, "-DUSE_UNICODE_PROPERTIES", "-DUSE_UNICODE_AGE_PROPERTIES"]
  unicode_src = IO.popen(cpp_cmd, &:read).each_line.select { |line| line =~ /\A[^#;\n]/ }.join

  gperf_map = Hash[unicode_src[/struct\s+uniname2ctype_pool_t\s+\{(.*?)\}\;/m, 1].scan(/uniname2ctype_pool_str(\d+).*\"(\S+)\"/)]

  # Group the pool names by the number they are paired with in wordlist[].
  names_by_number = Hash.new { |h, k| h[k] = [] }
  unicode_src[/wordlist\[\]\s+=\s+\{(.*?)\}\;/m, 1].scan(/uniname2ctype_pool_str(\d+).*?(\d+)/).each do |pool_id, number|
    names_by_number[number] << gperf_map[pool_id]
  end

  # Every name maps to the other names of its group.
  aliases = names_by_number.each_value.each_with_object({}) do |names, h|
    names.each { |n| h[n] = names - [n] }
  end

  ranges = unicode_src[/CodeRanges\[\]\s+=\s+\{(.*?)\}\;/m, 1].scan(/CR_(\w+)/).flatten
  out = ranges.flat_map do |range|
    # \d+ (rather than a single digit) keeps multi-digit versions such as
    # Age_10_0 in the "age=<major>.<minor>" form.
    name = range =~ /Age_(\d+)_(\d+)/ ? "age=#{$1}.#{$2}" : range.tr('_', '').downcase
    name = cr_map.delete(range) || name

    ([name] + aliases[name].to_a).map { |n| [n, range] }
  end

  entries = out.map { |name, table| "#{INDENT * 4}" + "new CodeRangeEntry(\"#{name}\", \"CR_#{table}\")" }.join(",\n")

  # Read the template before opening the destination so a missing template
  # does not leave a truncated output file behind. Block-form sub inserts the
  # replacement text literally.
  template = File.open("UnicodePropertiesTemplate.java", "rb", &:read)
  File.open("#{SRC_DIR}/unicode/YUnicodeProperties.java", "wb") do |f|
    f << template.sub(/%\{stdcrs\}/) { "#{INDENT * 4}null" }.sub(/%\{extcrs\}/) { entries }
  end
end
# Placeholder for case-fold table extraction from enc/unicode.o.
#
# It only recognizes the CaseFold_11_Table and CaseUnfold_<n>_Table symbols
# (n = 11, 12, 13) and does nothing with them yet; generate_data does not
# call it (the call there is commented out).
def genrate_fold_data
  process_binary "#{REPO_PATH}/enc/unicode.o" do |name, _binary, _address|
    case name
    when /CaseFold_11_Table/
      # TODO: not implemented
    when /CaseUnfold_(\d+)_Table/
      case Regexp.last_match(1)
      when '11'
        # TODO: not implemented
      when '12'
        # TODO: not implemented
      when '13'
        # TODO: not implemented
      end
    end
  end
end
# Run the generator only when this file is executed as a script, so it can
# also be loaded without triggering the generation steps.
generate_data if __FILE__ == $PROGRAM_NAME

0 commit comments

Comments
 (0)
Please sign in to comment.