+++ /dev/null
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-2.0
-#
-# This script uses Python's unicodedata module to generate ucs_width.c
-
-import unicodedata
-import sys
-
-def generate_ucs_width():
- # Output file name
- c_file = "ucs_width.c"
-
- # Width data mapping
- width_map = {} # Maps code points to width (0, 1, 2)
-
- # Define emoji modifiers and components that should have zero width
- emoji_zero_width = [
- # Skin tone modifiers
- (0x1F3FB, 0x1F3FF), # Emoji modifiers (skin tones)
-
- # Variation selectors (note: VS16 is treated specially in vt.c)
- (0xFE00, 0xFE0F), # Variation Selectors 1-16
-
- # Gender and hair style modifiers
- (0x2640, 0x2640), # Female sign
- (0x2642, 0x2642), # Male sign
- (0x26A7, 0x26A7), # Transgender symbol
- (0x1F9B0, 0x1F9B3), # Hair components (red, curly, white, bald)
-
- # Tag characters
- (0xE0020, 0xE007E), # Tags
- ]
-
- # Mark these emoji modifiers as zero-width
- for start, end in emoji_zero_width:
- for cp in range(start, end + 1):
- try:
- width_map[cp] = 0
- except (ValueError, OverflowError):
- continue
-
- # Mark all regional indicators as single-width as they are usually paired
- # providing a combined with of 2.
- regional_indicators = (0x1F1E6, 0x1F1FF) # Regional indicator symbols A-Z
- start, end = regional_indicators
- for cp in range(start, end + 1):
- try:
- width_map[cp] = 1
- except (ValueError, OverflowError):
- continue
-
- # Process all assigned Unicode code points (Basic Multilingual Plane + Supplementary Planes)
- # Range 0x0 to 0x10FFFF (the full Unicode range)
- for block_start in range(0, 0x110000, 0x1000):
- block_end = block_start + 0x1000
- for cp in range(block_start, block_end):
- try:
- char = chr(cp)
-
- # Skip if already processed
- if cp in width_map:
- continue
-
- # Check if the character is a combining mark
- category = unicodedata.category(char)
-
- # Combining marks, format characters, zero-width characters
- if (category.startswith('M') or # Mark (combining)
- (category == 'Cf' and cp not in (0x061C, 0x06DD, 0x070F, 0x180E, 0x200F, 0x202E, 0x2066, 0x2067, 0x2068, 0x2069)) or
- cp in (0x200B, 0x200C, 0x200D, 0x2060, 0xFEFF)): # Known zero-width characters
- width_map[cp] = 0
- continue
-
- # Use East Asian Width property
- eaw = unicodedata.east_asian_width(char)
-
- if eaw in ('F', 'W'): # Fullwidth or Wide
- width_map[cp] = 2
- elif eaw in ('Na', 'H', 'N', 'A'): # Narrow, Halfwidth, Neutral, Ambiguous
- width_map[cp] = 1
- else:
- # Default to single-width for unknown
- width_map[cp] = 1
-
- except (ValueError, OverflowError):
- # Skip invalid code points
- continue
-
- # Process Emoji - generally double-width
- # Ranges according to Unicode Emoji standard
- emoji_ranges = [
- (0x1F000, 0x1F02F), # Mahjong Tiles
- (0x1F0A0, 0x1F0FF), # Playing Cards
- (0x1F300, 0x1F5FF), # Miscellaneous Symbols and Pictographs
- (0x1F600, 0x1F64F), # Emoticons
- (0x1F680, 0x1F6FF), # Transport and Map Symbols
- (0x1F700, 0x1F77F), # Alchemical Symbols
- (0x1F780, 0x1F7FF), # Geometric Shapes Extended
- (0x1F800, 0x1F8FF), # Supplemental Arrows-C
- (0x1F900, 0x1F9FF), # Supplemental Symbols and Pictographs
- (0x1FA00, 0x1FA6F), # Chess Symbols
- (0x1FA70, 0x1FAFF), # Symbols and Pictographs Extended-A
- ]
-
- for start, end in emoji_ranges:
- for cp in range(start, end + 1):
- if cp not in width_map or width_map[cp] != 0: # Don't override zero-width
- try:
- char = chr(cp)
- width_map[cp] = 2
- except (ValueError, OverflowError):
- continue
-
- # Optimize to create range tables
- def ranges_optimize(width_data, target_width):
- points = sorted([cp for cp, width in width_data.items() if width == target_width])
- if not points:
- return []
-
- # Group consecutive code points into ranges
- ranges = []
- start = points[0]
- prev = start
-
- for cp in points[1:]:
- if cp > prev + 1:
- ranges.append((start, prev))
- start = cp
- prev = cp
-
- # Add the last range
- ranges.append((start, prev))
- return ranges
-
- # Extract ranges for each width
- zero_width_ranges = ranges_optimize(width_map, 0)
- double_width_ranges = ranges_optimize(width_map, 2)
-
- # Get Unicode version information
- unicode_version = unicodedata.unidata_version
-
- # Generate C implementation file
- with open(c_file, 'w') as f:
- f.write(f"""\
-// SPDX-License-Identifier: GPL-2.0
-/*
- * ucs_width.c - Unicode character width lookup
- *
- * Auto-generated by gen_ucs_width.py
- *
- * Unicode Version: {unicode_version}
- */
-
-#include <linux/types.h>
-#include <linux/array_size.h>
-#include <linux/bsearch.h>
-#include <linux/consolemap.h>
-
-struct interval {{
- uint32_t first;
- uint32_t last;
-}};
-
-/* Zero-width character ranges */
-static const struct interval zero_width_ranges[] = {{
-""")
-
- for start, end in zero_width_ranges:
- try:
- start_char_desc = unicodedata.name(chr(start)) if start < 0x10000 else f"U+{start:05X}"
- if start == end:
- comment = f"/* {start_char_desc} */"
- else:
- end_char_desc = unicodedata.name(chr(end)) if end < 0x10000 else f"U+{end:05X}"
- comment = f"/* {start_char_desc} - {end_char_desc} */"
- except:
- if start == end:
- comment = f"/* U+{start:05X} */"
- else:
- comment = f"/* U+{start:05X} - U+{end:05X} */"
-
- f.write(f"\t{{ 0x{start:05X}, 0x{end:05X} }}, {comment}\n")
-
- f.write("""\
-};
-
-/* Double-width character ranges */
-static const struct interval double_width_ranges[] = {
-""")
-
- for start, end in double_width_ranges:
- try:
- start_char_desc = unicodedata.name(chr(start)) if start < 0x10000 else f"U+{start:05X}"
- if start == end:
- comment = f"/* {start_char_desc} */"
- else:
- end_char_desc = unicodedata.name(chr(end)) if end < 0x10000 else f"U+{end:05X}"
- comment = f"/* {start_char_desc} - {end_char_desc} */"
- except:
- if start == end:
- comment = f"/* U+{start:05X} */"
- else:
- comment = f"/* U+{start:05X} - U+{end:05X} */"
-
- f.write(f"\t{{ 0x{start:05X}, 0x{end:05X} }}, {comment}\n")
-
- f.write("""\
-};
-
-
-static int ucs_cmp(const void *key, const void *element)
-{
- uint32_t cp = *(uint32_t *)key;
- const struct interval *e = element;
-
- if (cp > e->last)
- return 1;
- if (cp < e->first)
- return -1;
- return 0;
-}
-
-static bool is_in_interval(uint32_t cp, const struct interval *intervals, size_t count)
-{
- if (cp < intervals[0].first || cp > intervals[count - 1].last)
- return false;
-
- return __inline_bsearch(&cp, intervals, count,
- sizeof(*intervals), ucs_cmp) != NULL;
-}
-
-/**
- * Determine if a Unicode code point is zero-width.
- *
- * @param ucs: Unicode code point (UCS-4)
- * Return: true if the character is zero-width, false otherwise
- */
-bool ucs_is_zero_width(uint32_t cp)
-{
- return is_in_interval(cp, zero_width_ranges, ARRAY_SIZE(zero_width_ranges));
-}
-
-/**
- * Determine if a Unicode code point is double-width.
- *
- * @param ucs: Unicode code point (UCS-4)
- * Return: true if the character is double-width, false otherwise
- */
-bool ucs_is_double_width(uint32_t cp)
-{
- return is_in_interval(cp, double_width_ranges, ARRAY_SIZE(double_width_ranges));
-}
-""")
-
- # Print summary
- zero_width_count = sum(end - start + 1 for start, end in zero_width_ranges)
- double_width_count = sum(end - start + 1 for start, end in double_width_ranges)
-
- print(f"Generated {c_file} with:")
- print(f"- {len(zero_width_ranges)} zero-width ranges covering ~{zero_width_count} code points")
- print(f"- {len(double_width_ranges)} double-width ranges covering ~{double_width_count} code points")
-
-if __name__ == "__main__":
- generate_ucs_width()