From: Nicolas Pitre Date: Thu, 10 Apr 2025 01:14:01 +0000 (-0400) Subject: vt: update gen_ucs_width.py to produce more space efficient tables X-Git-Tag: v6.16-rc1~29^2~80 X-Git-Url: https://git.kernel.dk/?a=commitdiff_plain;h=119ff0b0f4541972d829da606599441dace2444d;p=linux-block.git vt: update gen_ucs_width.py to produce more space efficient tables Split table ranges into BMP (16-bit) and non-BMP (above 16-bit). This reduces the corresponding text size by 20-25%. Signed-off-by: Nicolas Pitre Link: https://lore.kernel.org/r/20250410011839.64418-10-nico@fluxnic.net Signed-off-by: Greg Kroah-Hartman --- diff --git a/drivers/tty/vt/gen_ucs_width.py b/drivers/tty/vt/gen_ucs_width.py index 41997fe00129..c6cbc93e83f2 100755 --- a/drivers/tty/vt/gen_ucs_width.py +++ b/drivers/tty/vt/gen_ucs_width.py @@ -132,13 +132,49 @@ def generate_ucs_width(): ranges.append((start, prev)) return ranges + # Function to split ranges into BMP (16-bit) and non-BMP (above 16-bit) + def split_ranges_by_size(ranges): + bmp_ranges = [] + non_bmp_ranges = [] + + for start, end in ranges: + if end <= 0xFFFF: + bmp_ranges.append((start, end)) + elif start > 0xFFFF: + non_bmp_ranges.append((start, end)) + else: + # Split the range at 0xFFFF + bmp_ranges.append((start, 0xFFFF)) + non_bmp_ranges.append((0x10000, end)) + + return bmp_ranges, non_bmp_ranges + # Extract ranges for each width zero_width_ranges = ranges_optimize(width_map, 0) double_width_ranges = ranges_optimize(width_map, 2) + # Split ranges into BMP and non-BMP + zero_width_bmp, zero_width_non_bmp = split_ranges_by_size(zero_width_ranges) + double_width_bmp, double_width_non_bmp = split_ranges_by_size(double_width_ranges) + # Get Unicode version information unicode_version = unicodedata.unidata_version + # Function to generate code point description comments + def get_code_point_comment(start, end): + try: + start_char_desc = unicodedata.name(chr(start)) + if start == end: + return f"/* {start_char_desc} */" + else: + end_char_desc = unicodedata.name(chr(end)) + return f"/* {start_char_desc} - {end_char_desc} */" + except: + if start == end: + return f"/* U+{start:04X} */" + else: + return f"/* U+{start:04X} - U+{end:04X} */" + # Generate C implementation file with open(c_file, 'w') as f: f.write(f"""\ @@ -156,62 +192,77 @@ def generate_ucs_width(): #include #include -struct interval {{ +struct interval16 {{ + uint16_t first; + uint16_t last; +}}; + +struct interval32 {{ uint32_t first; uint32_t last; }}; -/* Zero-width character ranges */ -static const struct interval zero_width_ranges[] = {{ +/* Zero-width character ranges (BMP - Basic Multilingual Plane, U+0000 to U+FFFF) */ +static const struct interval16 zero_width_bmp[] = {{ """) - for start, end in zero_width_ranges: - try: - start_char_desc = unicodedata.name(chr(start)) if start < 0x10000 else f"U+{start:05X}" - if start == end: - comment = f"/* {start_char_desc} */" - else: - end_char_desc = unicodedata.name(chr(end)) if end < 0x10000 else f"U+{end:05X}" - comment = f"/* {start_char_desc} - {end_char_desc} */" - except: - if start == end: - comment = f"/* U+{start:05X} */" - else: - comment = f"/* U+{start:05X} - U+{end:05X} */" + for start, end in zero_width_bmp: + comment = get_code_point_comment(start, end) + f.write(f"\t{{ 0x{start:04X}, 0x{end:04X} }}, {comment}\n") + + f.write("""\ +}; +/* Zero-width character ranges (non-BMP, U+10000 and above) */ +static const struct interval32 zero_width_non_bmp[] = { +""") + + for start, end in zero_width_non_bmp: + comment = get_code_point_comment(start, end) f.write(f"\t{{ 0x{start:05X}, 0x{end:05X} }}, {comment}\n") f.write("""\ }; -/* Double-width character ranges */ -static const struct interval double_width_ranges[] = { +/* Double-width character ranges (BMP - Basic Multilingual Plane, U+0000 to U+FFFF) */ +static const struct interval16 double_width_bmp[] = { """) - for start, end in double_width_ranges: - try: - start_char_desc = unicodedata.name(chr(start)) if start < 0x10000 else f"U+{start:05X}" - if start == end: - comment = f"/* {start_char_desc} */" - else: - end_char_desc = unicodedata.name(chr(end)) if end < 0x10000 else f"U+{end:05X}" - comment = f"/* {start_char_desc} - {end_char_desc} */" - except: - if start == end: - comment = f"/* U+{start:05X} */" - else: - comment = f"/* U+{start:05X} - U+{end:05X} */" + for start, end in double_width_bmp: + comment = get_code_point_comment(start, end) + f.write(f"\t{{ 0x{start:04X}, 0x{end:04X} }}, {comment}\n") + + f.write("""\ +}; +/* Double-width character ranges (non-BMP, U+10000 and above) */ +static const struct interval32 double_width_non_bmp[] = { +""") + + for start, end in double_width_non_bmp: + comment = get_code_point_comment(start, end) f.write(f"\t{{ 0x{start:05X}, 0x{end:05X} }}, {comment}\n") f.write("""\ }; -static int ucs_cmp(const void *key, const void *element) +static int ucs_cmp16(const void *key, const void *element) +{ + uint16_t cp = *(uint16_t *)key; + const struct interval16 *e = element; + + if (cp > e->last) + return 1; + if (cp < e->first) + return -1; + return 0; +} + +static int ucs_cmp32(const void *key, const void *element) { uint32_t cp = *(uint32_t *)key; - const struct interval *e = element; + const struct interval32 *e = element; if (cp > e->last) return 1; @@ -220,13 +271,22 @@ static int ucs_cmp(const void *key, const void *element) return 0; } -static bool is_in_interval(uint32_t cp, const struct interval *intervals, size_t count) +static bool is_in_interval16(uint16_t cp, const struct interval16 *intervals, size_t count) { if (cp < intervals[0].first || cp > intervals[count - 1].last) return false; return __inline_bsearch(&cp, intervals, count, - sizeof(*intervals), ucs_cmp) != NULL; + sizeof(*intervals), ucs_cmp16) != NULL; +} + +static bool is_in_interval32(uint32_t cp, const struct interval32 *intervals, size_t count) +{ + if (cp < intervals[0].first || cp > intervals[count - 1].last) + return false; + + return __inline_bsearch(&cp, intervals, count, + sizeof(*intervals), ucs_cmp32) != NULL; } /** @@ -237,7 +297,9 @@ static bool is_in_interval(uint32_t cp, const struct interval *intervals, size_t */ bool ucs_is_zero_width(uint32_t cp) { - return is_in_interval(cp, zero_width_ranges, ARRAY_SIZE(zero_width_ranges)); + return (cp <= 0xFFFF) + ? is_in_interval16(cp, zero_width_bmp, ARRAY_SIZE(zero_width_bmp)) + : is_in_interval32(cp, zero_width_non_bmp, ARRAY_SIZE(zero_width_non_bmp)); } /** @@ -248,17 +310,27 @@ bool ucs_is_zero_width(uint32_t cp) */ bool ucs_is_double_width(uint32_t cp) { - return is_in_interval(cp, double_width_ranges, ARRAY_SIZE(double_width_ranges)); + return (cp <= 0xFFFF) + ? is_in_interval16(cp, double_width_bmp, ARRAY_SIZE(double_width_bmp)) + : is_in_interval32(cp, double_width_non_bmp, ARRAY_SIZE(double_width_non_bmp)); } """) # Print summary - zero_width_count = sum(end - start + 1 for start, end in zero_width_ranges) - double_width_count = sum(end - start + 1 for start, end in double_width_ranges) + zero_width_bmp_count = sum(end - start + 1 for start, end in zero_width_bmp) + zero_width_non_bmp_count = sum(end - start + 1 for start, end in zero_width_non_bmp) + double_width_bmp_count = sum(end - start + 1 for start, end in double_width_bmp) + double_width_non_bmp_count = sum(end - start + 1 for start, end in double_width_non_bmp) + + total_zero_width = zero_width_bmp_count + zero_width_non_bmp_count + total_double_width = double_width_bmp_count + double_width_non_bmp_count print(f"Generated {c_file} with:") - print(f"- {len(zero_width_ranges)} zero-width ranges covering ~{zero_width_count} code points") - print(f"- {len(double_width_ranges)} double-width ranges covering ~{double_width_count} code points") + print(f"- {len(zero_width_bmp)} zero-width BMP ranges (16-bit) covering ~{zero_width_bmp_count} code points") + print(f"- {len(zero_width_non_bmp)} zero-width non-BMP ranges (32-bit) covering ~{zero_width_non_bmp_count} code points") + print(f"- {len(double_width_bmp)} double-width BMP ranges (16-bit) covering ~{double_width_bmp_count} code points") + print(f"- {len(double_width_non_bmp)} double-width non-BMP ranges (32-bit) covering ~{double_width_non_bmp_count} code points") + print(f"Total: {len(zero_width_bmp) + len(zero_width_non_bmp) + len(double_width_bmp) + len(double_width_non_bmp)} ranges covering ~{total_zero_width + total_double_width} code points") if __name__ == "__main__": generate_ucs_width()