# Copyright (C) 2018 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

# Python 2/3 Compatibility (ICU-20299)
# TODO(ICU-20301): Remove this.
from __future__ import print_function

from icutools.databuilder import *
from icutools.databuilder import utils
from icutools.databuilder.request_types import *

import os
import sys


def generate(config, io, common_vars):
    """Build the complete list of data build requests.

    Args:
        config: Build configuration; this function reads
            ``config.use_pool_bundle`` and passes the object on to every
            ``generate_*`` helper.
        io: Object providing ``glob()`` (and ``read_locale_deps()``, used by
            ``generate_tree``) over the source data directory.
        common_vars: Mapping of shared format variables, forwarded to the
            helpers.

    Returns:
        A list of request objects, ending with the ``icudata_list``
        ``ListRequest``.

    Exits the process with status 1 (after printing to stderr) when nothing
    matches ``misc/*``, i.e. the data directory cannot be found.
    """
    requests = []

    if not io.glob("misc/*"):
        print("Error: Cannot find data directory; please specify --src_dir", file=sys.stderr)
        # Use sys.exit rather than the site-provided exit() builtin, which is
        # not guaranteed to be available in every interpreter configuration.
        sys.exit(1)

    requests += generate_cnvalias(config, io, common_vars)
    requests += generate_ulayout(config, io, common_vars)
    requests += generate_uemoji(config, io, common_vars)
    requests += generate_confusables(config, io, common_vars)
    requests += generate_conversion_mappings(config, io, common_vars)
    requests += generate_brkitr_brk(config, io, common_vars)
    requests += generate_brkitr_lstm(config, io, common_vars)
    requests += generate_brkitr_adaboost(config, io, common_vars)
    requests += generate_stringprep(config, io, common_vars)
    requests += generate_brkitr_dictionaries(config, io, common_vars)
    requests += generate_normalization(config, io, common_vars)
    requests += generate_coll_ucadata(config, io, common_vars)
    requests += generate_full_unicore_data(config, io, common_vars)
    requests += generate_unames(config, io, common_vars)
    requests += generate_misc(config, io, common_vars)
    requests += generate_curr_supplemental(config, io, common_vars)
    requests += generate_zone_supplemental(config, io, common_vars)
    requests += generate_translit(config, io, common_vars)

    # Res Tree Files
    # (input dirname, output dirname, use pool file, dep files)
    requests += generate_tree(config, io, common_vars,
        "locales",
        None,
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "curr",
        "curr",
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "lang",
        "lang",
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "region",
        "region",
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "zone",
        "zone",
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "unit",
        "unit",
        config.use_pool_bundle,
        [])

    requests += generate_tree(config, io, common_vars,
        "coll",
        "coll",
        # Never use pool bundle for coll, brkitr, or rbnf
        False,
        # Depends on timezoneTypes.res and keyTypeData.res.
        # TODO: We should not need this dependency to build collation.
        # TODO: Bake keyTypeData.res into the common library?
        [DepTarget("coll_ucadata"), DepTarget("misc_res"), InFile("unidata/UCARules.txt")])

    requests += generate_tree(config, io, common_vars,
        "brkitr",
        "brkitr",
        # Never use pool bundle for coll, brkitr, or rbnf
        False,
        [DepTarget("brkitr_brk"), DepTarget("dictionaries")])

    requests += generate_tree(config, io, common_vars,
        "rbnf",
        "rbnf",
        # Never use pool bundle for coll, brkitr, or rbnf
        False,
        [])

    requests += [
        ListRequest(
            name = "icudata_list",
            variable_name = "icudata_all_output_files",
            output_file = TmpFile("icudata.lst"),
            include_tmp = False
        )
    ]

    return requests


def generate_cnvalias(config, io, common_vars):
    """Return the request running gencnval on mappings/convrtrs.txt."""
    # UConv Name Aliases
    input_file = InFile("mappings/convrtrs.txt")
    output_file = OutFile("cnvalias.icu")
    return [
        SingleExecutionRequest(
            name = "cnvalias",
            category = "cnvalias",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("gencnval"),
            args = "-s {IN_DIR} -d {OUT_DIR} "
                "{INPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_confusables(config, io, common_vars):
    """Return the request running gencfu to produce confusables.cfu."""
    # CONFUSABLES
    txt1 = InFile("unidata/confusables.txt")
    txt2 = InFile("unidata/confusablesWholeScript.txt")
    cfu = OutFile("confusables.cfu")
    return [
        SingleExecutionRequest(
            name = "confusables",
            category = "confusables",
            dep_targets = [DepTarget("cnvalias")],
            input_files = [txt1, txt2],
            output_files = [cfu],
            tool = IcuTool("gencfu"),
            args = "-d {OUT_DIR} -i {OUT_DIR} "
                "-c -r {IN_DIR}/{INPUT_FILES[0]} -w {IN_DIR}/{INPUT_FILES[1]} "
                "-o {OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_conversion_mappings(config, io, common_vars):
    """Return the makeconv request covering every mappings/*.ucm file."""
    # UConv Conversion Table Files
    input_files = [InFile(filename) for filename in io.glob("mappings/*.ucm")]
    # [9:-4] drops the "mappings/" prefix and the ".ucm" suffix.
    output_files = [OutFile("%s.cnv" % v.filename[9:-4]) for v in input_files]
    # TODO: handle BUILD_SPECIAL_CNV_FILES? Means to add --ignore-siso-check flag to makeconv
    return [
        RepeatedOrSingleExecutionRequest(
            name = "conversion_mappings",
            category = "conversion_mappings",
            dep_targets = [],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("makeconv"),
            args = "-s {IN_DIR} -d {OUT_DIR} -c {INPUT_FILE_PLACEHOLDER}",
            format_with = {},
            repeat_with = {
                "INPUT_FILE_PLACEHOLDER": utils.SpaceSeparatedList(file.filename for file in input_files)
            }
        )
    ]


def generate_brkitr_brk(config, io, common_vars):
    """Return the genbrk request covering every brkitr/rules/*.txt file."""
    # BRK Files
    input_files = [InFile(filename) for filename in io.glob("brkitr/rules/*.txt")]
    # [13:-4] drops the "brkitr/rules/" prefix and the ".txt" suffix.
    output_files = [OutFile("brkitr/%s.brk" % v.filename[13:-4]) for v in input_files]
    return [
        RepeatedExecutionRequest(
            name = "brkitr_brk",
            category = "brkitr_rules",
            dep_targets = [DepTarget("cnvalias"),
                DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res"), DepTarget("adaboost_res")],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genbrk"),
            args = "-d {OUT_DIR} -i {OUT_DIR} "
                "-c -r {IN_DIR}/{INPUT_FILE} "
                "-o {OUTPUT_FILE}",
            format_with = {},
            repeat_with = {}
        )
    ]


def generate_stringprep(config, io, common_vars):
    """Return the gensprep request covering every sprep/*.txt file."""
    # SPP FILES
    input_files = [InFile(filename) for filename in io.glob("sprep/*.txt")]
    # [6:-4] drops the "sprep/" prefix and the ".txt" suffix.
    output_files = [OutFile("%s.spp" % v.filename[6:-4]) for v in input_files]
    bundle_names = [v.filename[6:-4] for v in input_files]
    return [
        RepeatedExecutionRequest(
            name = "stringprep",
            category = "stringprep",
            dep_targets = [InFile("unidata/NormalizationCorrections.txt")],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("gensprep"),
            args = "-s {IN_DIR}/sprep -d {OUT_DIR} -i {OUT_DIR} "
                "-b {BUNDLE_NAME} -m {IN_DIR}/unidata -u 3.2.0 {BUNDLE_NAME}.txt",
            format_with = {},
            repeat_with = {
                "BUNDLE_NAME": bundle_names
            }
        )
    ]


def generate_brkitr_dictionaries(config, io, common_vars):
    """Return the gendict request covering brkitr/dictionaries/*.txt.

    Raises:
        KeyError: If a globbed dictionary file has no entry in the
            per-file options table below.
    """
    # Dict Files
    input_files = [InFile(filename) for filename in io.glob("brkitr/dictionaries/*.txt")]
    # [20:-4] drops the "brkitr/dictionaries/" prefix and the ".txt" suffix.
    output_files = [OutFile("brkitr/%s.dict" % v.filename[20:-4]) for v in input_files]
    extra_options_map = {
        "brkitr/dictionaries/burmesedict.txt": "--bytes --transform offset-0x1000",
        "brkitr/dictionaries/cjdict.txt": "--uchars",
        "brkitr/dictionaries/khmerdict.txt": "--bytes --transform offset-0x1780",
        "brkitr/dictionaries/laodict.txt": "--bytes --transform offset-0x0e80",
        "brkitr/dictionaries/thaidict.txt": "--bytes --transform offset-0x0e00"
    }
    extra_optionses = [extra_options_map[v.filename] for v in input_files]
    return [
        RepeatedExecutionRequest(
            name = "dictionaries",
            category = "brkitr_dictionaries",
            dep_targets = [],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("gendict"),
            args = "-i {OUT_DIR} "
                "-c {EXTRA_OPTIONS} "
                "{IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}",
            format_with = {},
            repeat_with = {
                "EXTRA_OPTIONS": extra_optionses
            }
        )
    ]


def generate_normalization(config, io, common_vars):
    """Return the icupkg request covering in/*.nrm except in/nfc.nrm.

    Raises:
        ValueError: If in/nfc.nrm is not among the globbed files
            (``list.remove`` of a missing element).
    """
    # NRM Files
    input_files = [InFile(filename) for filename in io.glob("in/*.nrm")]
    # nfc.nrm is pre-compiled into C++; see generate_full_unicore_data
    input_files.remove(InFile("in/nfc.nrm"))
    # [3:] drops the "in/" prefix.
    output_files = [OutFile(v.filename[3:]) for v in input_files]
    return [
        RepeatedExecutionRequest(
            name = "normalization",
            category = "normalization",
            dep_targets = [],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}",
            format_with = {},
            repeat_with = {}
        )
    ]


def generate_coll_ucadata(config, io, common_vars):
    """Return the icupkg request producing coll/ucadata.icu.

    The input file name is selected by ``config.coll_han_type``.
    """
    # Collation Dependency File (ucadata.icu)
    input_file = InFile("in/coll/ucadata-%s.icu" % config.coll_han_type)
    output_file = OutFile("coll/ucadata.icu")
    return [
        SingleExecutionRequest(
            name = "coll_ucadata",
            category = "coll_ucadata",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_full_unicore_data(config, io, common_vars):
    """Return the icupkg request for the core Unicode property files.

    Returns an empty list unless ``config.include_uni_core_data`` is truthy.
    """
    # The core Unicode properties files (pnames.icu, uprops.icu, ucase.icu, ubidi.icu)
    # are hardcoded in the common DLL and therefore not included in the data package any more.
    # They are not built by default but need to be built for ICU4J data,
    # both in the .jar and in the .dat file (if ICU4J uses the .dat file).
    # See ICU-4497.
    if not config.include_uni_core_data:
        return []

    basenames = [
        "pnames.icu",
        "uprops.icu",
        "ucase.icu",
        "ubidi.icu",
        "nfc.nrm"
    ]
    input_files = [InFile("in/%s" % bn) for bn in basenames]
    output_files = [OutFile(bn) for bn in basenames]
    return [
        RepeatedExecutionRequest(
            name = "unicore",
            category = "unicore",
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILE} {OUT_DIR}/{OUTPUT_FILE}"
        )
    ]


def generate_unames(config, io, common_vars):
    """Return the icupkg request producing unames.icu from in/unames.icu."""
    # Unicode Character Names
    input_file = InFile("in/unames.icu")
    output_file = OutFile("unames.icu")
    return [
        SingleExecutionRequest(
            name = "unames",
            category = "unames",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_ulayout(config, io, common_vars):
    """Return the icupkg request producing ulayout.icu from in/ulayout.icu."""
    # Unicode text layout properties
    basename = "ulayout"
    input_file = InFile("in/%s.icu" % basename)
    output_file = OutFile("%s.icu" % basename)
    return [
        SingleExecutionRequest(
            name = basename,
            category = basename,
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_uemoji(config, io, common_vars):
    """Return the icupkg request producing uemoji.icu from in/uemoji.icu."""
    # Unicode emoji properties
    basename = "uemoji"
    input_file = InFile("in/%s.icu" % basename)
    output_file = OutFile("%s.icu" % basename)
    return [
        SingleExecutionRequest(
            name = basename,
            category = basename,
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("icupkg"),
            args = "-t{ICUDATA_CHAR} {IN_DIR}/{INPUT_FILES[0]} {OUT_DIR}/{OUTPUT_FILES[0]}",
            format_with = {}
        )
    ]


def generate_misc(config, io, common_vars):
    """Return the genrb request covering every misc/*.txt file."""
    # Misc Data Res Files
    input_files = [InFile(filename) for filename in io.glob("misc/*.txt")]
    # [5:] drops the "misc/" prefix; [:-4] below drops the ".txt" suffix.
    input_basenames = [v.filename[5:] for v in input_files]
    output_files = [OutFile("%s.res" % v[:-4]) for v in input_basenames]
    return [
        RepeatedExecutionRequest(
            name = "misc_res",
            category = "misc",
            dep_targets = [DepTarget("cnvalias")], # ICU-21175
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/misc -d {OUT_DIR} -i {OUT_DIR} "
                "-k -q "
                "{INPUT_BASENAME}",
            format_with = {},
            repeat_with = {
                "INPUT_BASENAME": input_basenames
            }
        )
    ]


def generate_curr_supplemental(config, io, common_vars):
    """Return the genrb request producing curr/supplementalData.res."""
    # Currency Supplemental Res File
    input_file = InFile("curr/supplementalData.txt")
    input_basename = "supplementalData.txt"
    output_file = OutFile("curr/supplementalData.res")
    return [
        SingleExecutionRequest(
            name = "curr_supplemental_res",
            category = "curr_supplemental",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/curr -d {OUT_DIR}/curr -i {OUT_DIR} "
                "-k "
                "{INPUT_BASENAME}",
            format_with = {
                "INPUT_BASENAME": input_basename
            }
        )
    ]


def generate_zone_supplemental(config, io, common_vars):
    """Return the genrb request producing zone/tzdbNames.res."""
    # tzdbNames Res File
    input_file = InFile("zone/tzdbNames.txt")
    input_basename = "tzdbNames.txt"
    output_file = OutFile("zone/tzdbNames.res")
    return [
        SingleExecutionRequest(
            name = "zone_supplemental_res",
            category = "zone_supplemental",
            dep_targets = [],
            input_files = [input_file],
            output_files = [output_file],
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/zone -d {OUT_DIR}/zone -i {OUT_DIR} "
                "-k "
                "{INPUT_BASENAME}",
            format_with = {
                "INPUT_BASENAME": input_basename
            }
        )
    ]


def generate_translit(config, io, common_vars):
    """Return the genrb request for the translit root/en/el bundles.

    Every other translit/*.txt file is passed as a dependency rather than
    as an input.
    """
    input_files = [
        InFile("translit/root.txt"),
        InFile("translit/en.txt"),
        InFile("translit/el.txt")
    ]
    dep_files = set(InFile(filename) for filename in io.glob("translit/*.txt"))
    dep_files -= set(input_files)
    # sorted() already returns a new list, in a deterministic order.
    dep_files = sorted(dep_files)
    # [9:] drops the "translit/" prefix; [:-4] below drops the ".txt" suffix.
    input_basenames = [v.filename[9:] for v in input_files]
    output_files = [
        OutFile("translit/%s.res" % v[:-4])
        for v in input_basenames
    ]
    return [
        RepeatedOrSingleExecutionRequest(
            name = "translit_res",
            category = "translit",
            dep_targets = dep_files,
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/translit -d {OUT_DIR}/translit -i {OUT_DIR} "
                "-k "
                "{INPUT_BASENAME}",
            format_with = {
            },
            repeat_with = {
                "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames)
            }
        )
    ]


def generate_brkitr_lstm(config, io, common_vars):
    """Return the genrb request covering every brkitr/lstm/*.txt file."""
    input_files = [InFile(filename) for filename in io.glob("brkitr/lstm/*.txt")]
    # [12:] drops the "brkitr/lstm/" prefix; [:-4] below drops ".txt".
    input_basenames = [v.filename[12:] for v in input_files]
    output_files = [
        OutFile("brkitr/%s.res" % v[:-4])
        for v in input_basenames
    ]
    return [
        RepeatedOrSingleExecutionRequest(
            name = "lstm_res",
            category = "brkitr_lstm",
            dep_targets = [],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/brkitr/lstm -d {OUT_DIR}/brkitr -i {OUT_DIR} "
                "-k "
                "{INPUT_BASENAME}",
            format_with = {
            },
            repeat_with = {
                "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames)
            }
        )
    ]


def generate_brkitr_adaboost(config, io, common_vars):
    """Return the genrb request covering every brkitr/adaboost/*.txt file."""
    input_files = [InFile(filename) for filename in io.glob("brkitr/adaboost/*.txt")]
    # [16:] drops the "brkitr/adaboost/" prefix; [:-4] below drops ".txt".
    input_basenames = [v.filename[16:] for v in input_files]
    output_files = [
        OutFile("brkitr/%s.res" % v[:-4])
        for v in input_basenames
    ]
    return [
        RepeatedOrSingleExecutionRequest(
            name = "adaboost_res",
            category = "brkitr_adaboost",
            dep_targets = [],
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/brkitr/adaboost -d {OUT_DIR}/brkitr -i {OUT_DIR} "
                "-k "
                "{INPUT_BASENAME}",
            format_with = {
            },
            repeat_with = {
                "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames)
            }
        )
    ]


def generate_tree(
        config,
        io,
        common_vars,
        sub_dir,
        out_sub_dir,
        use_pool_bundle,
        dep_targets):
    """Build the requests for one resource-bundle tree.

    Args:
        config: Build configuration (not read directly in this function).
        io: Object providing ``glob()`` and ``read_locale_deps()``.
        common_vars: Mapping of shared format variables; expanded into
            ``str.format`` calls, which reference ``OUT_DIR`` and
            ``INDEX_NAME``.
        sub_dir: Input sub-directory name; also used to derive the target
            names and the category.
        out_sub_dir: Output sub-directory name, or a falsy value to write
            to the top of the output directory.
        use_pool_bundle: When truthy, an extra ``<sub_dir>_pool_write``
            request is emitted and the tree request depends on it.
        dep_targets: Dependencies for the tree request; the caller's list
            is not mutated.

    Returns:
        A list holding the optional pool-bundle request, the
        ``<sub_dir>_res`` request and the ``<sub_dir>_index_txt`` request.
    """
    requests = []
    category = "%s_tree" % sub_dir
    out_prefix = ("%s/" % out_sub_dir) if out_sub_dir else ""
    input_files = [InFile(filename) for filename in io.glob("%s/*.txt" % sub_dir)]
    # These two files are built by their own dedicated requests
    # (generate_curr_supplemental / generate_zone_supplemental).
    if sub_dir == "curr":
        input_files.remove(InFile("curr/supplementalData.txt"))
    if sub_dir == "zone":
        input_files.remove(InFile("zone/tzdbNames.txt"))
    # Drop the "<sub_dir>/" prefix; [:-4] below drops the ".txt" suffix.
    input_basenames = [v.filename[len(sub_dir)+1:] for v in input_files]
    output_files = [
        OutFile("%s%s.res" % (out_prefix, v[:-4]))
        for v in input_basenames
    ]

    # Generate Pool Bundle
    if use_pool_bundle:
        input_pool_files = [OutFile("%spool.res" % out_prefix)]
        pool_target_name = "%s_pool_write" % sub_dir
        use_pool_bundle_option = "--usePoolBundle {OUT_DIR}/{OUT_PREFIX}".format(
            OUT_PREFIX = out_prefix,
            **common_vars
        )
        requests += [
            SingleExecutionRequest(
                name = pool_target_name,
                category = category,
                dep_targets = dep_targets,
                input_files = input_files,
                output_files = input_pool_files,
                tool = IcuTool("genrb"),
                args = "-s {IN_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} "
                    "--writePoolBundle -k "
                    "{INPUT_BASENAMES_SPACED}",
                format_with = {
                    "IN_SUB_DIR": sub_dir,
                    "OUT_PREFIX": out_prefix,
                    "INPUT_BASENAMES_SPACED": utils.SpaceSeparatedList(input_basenames)
                }
            ),
        ]
        # Rebind to a new list so the caller's dep_targets is left untouched.
        dep_targets = dep_targets + [DepTarget(pool_target_name)]
    else:
        use_pool_bundle_option = ""

    # Generate Res File Tree
    requests += [
        RepeatedOrSingleExecutionRequest(
            name = "%s_res" % sub_dir,
            category = category,
            dep_targets = dep_targets,
            input_files = input_files,
            output_files = output_files,
            tool = IcuTool("genrb"),
            args = "-s {IN_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} "
                "{EXTRA_OPTION} -k "
                "{INPUT_BASENAME}",
            format_with = {
                "IN_SUB_DIR": sub_dir,
                "OUT_PREFIX": out_prefix,
                "EXTRA_OPTION": use_pool_bundle_option
            },
            repeat_with = {
                "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames)
            }
        )
    ]

    # Generate res_index file
    # Exclude the deprecated locale variants and root; see ICU-20628. This
    # could be data-driven, but we do not want to perform I/O in this script
    # (for example, we do not want to read from an XML file).
    excluded_locales = {
        "ja_JP_TRADITIONAL",
        "th_TH_TRADITIONAL",
        "de_",
        "de__PHONEBOOK",
        "es_",
        "es__TRADITIONAL",
        "root",
    }
    # Put alias locales in a separate structure; see ICU-20627
    dependency_data = io.read_locale_deps(sub_dir)
    if "aliases" in dependency_data:
        alias_locales = set(dependency_data["aliases"].keys())
    else:
        alias_locales = set()
    alias_files = []
    installed_files = []
    for f in input_files:
        file_stem = IndexRequest.locale_file_stem(f)
        if file_stem in excluded_locales:
            continue
        destination = alias_files if file_stem in alias_locales else installed_files
        destination.append(f)
    # Only the "locales" tree passes a CLDR version to the index request.
    cldr_version = dependency_data["cldrVersion"] if sub_dir == "locales" else None
    index_file_txt = TmpFile("{IN_SUB_DIR}/{INDEX_NAME}.txt".format(
        IN_SUB_DIR = sub_dir,
        **common_vars
    ))
    index_res_file = OutFile("{OUT_PREFIX}{INDEX_NAME}.res".format(
        OUT_PREFIX = out_prefix,
        **common_vars
    ))
    index_file_target_name = "%s_index_txt" % sub_dir
    requests += [
        IndexRequest(
            name = index_file_target_name,
            category = category,
            installed_files = installed_files,
            alias_files = alias_files,
            txt_file = index_file_txt,
            output_file = index_res_file,
            cldr_version = cldr_version,
            args = "-s {TMP_DIR}/{IN_SUB_DIR} -d {OUT_DIR}/{OUT_PREFIX} -i {OUT_DIR} "
                "-k "
                "{INDEX_NAME}.txt",
            format_with = {
                "IN_SUB_DIR": sub_dir,
                "OUT_PREFIX": out_prefix
            }
        )
    ]

    return requests