/* * Usage: * gen_collate <INPUTDIR> [-o OUTPUTFILE] LOCALE ... * * Generate collation data from locales LOCALE. * Reads all LOCALE from INPUTDIR and writes collation data to OUTPUTFILE. * * The output file defaults to "locales_collate.h". */ /* TODO: * * add UNDEFINED at end if not specified * convert POSITION -> FORWARD,POSITION * * * deal with lowercase in <Uhhhh> * * what about reorders that keep the same rule? * * remove "unused" collation elements? (probably doesn't save much) * * add_rule function ... returns index into rule table after possibly adding custom-indexed rule * but don't forget about multichar weights... replace with strings of indexes * */ #include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <stdint.h> #include <stdarg.h> #include <limits.h> #include <ctype.h> #include <assert.h> #include <errno.h> #include <search.h> typedef struct { char *name; /* */ int num_weights; /* */ int ii_shift; /* */ int ti_shift; /* */ int ii_len; /* */ int ti_len; /* */ int max_weight; /* */ int num_col_base; /* */ int max_col_index; /* */ int undefined_idx; /* */ int range_low; /* */ int range_count; /* high - low */ int range_base_weight; /* */ int num_starters; /* */ int range_rule_offset; /* */ int wcs2colidt_offset; /* */ int index2weight_offset; /* */ int index2ruleidx_offset; /* */ int multistart_offset; /* */ } base_locale_t; #define BASE_LOCALE_LEN 20 static base_locale_t base_locale_array[BASE_LOCALE_LEN]; static size_t base_locale_len; typedef struct { char *name; /* */ int base_idx; /* */ int undefined_idx; /* */ int overrides_offset; /* */ int multistart_offset; /* */ } der_locale_t; #define DER_LOCALE_LEN 300 static der_locale_t der_locale_array[DER_LOCALE_LEN]; static size_t der_locale_len; #define OVERRIDE_LEN 50000 static uint16_t override_buffer[OVERRIDE_LEN]; static size_t override_len; #define MULTISTART_LEN 10000 static uint16_t multistart_buffer[MULTISTART_LEN]; static size_t multistart_len; #define WCS2COLIDT_LEN 200000 static uint16_t wcs2colidt_buffer[WCS2COLIDT_LEN]; static size_t wcs2colidt_len; #define INDEX2WEIGHT_LEN 200000 static uint16_t index2weight_buffer[INDEX2WEIGHT_LEN]; static size_t index2weight_len; static uint16_t index2ruleidx_buffer[INDEX2WEIGHT_LEN]; static size_t index2ruleidx_len; #define WEIGHTSTR_LEN 10000 static uint16_t weightstr_buffer[WEIGHTSTR_LEN]; static size_t weightstr_len; #define RULETABLE_LEN (1L<<16) static uint16_t ruletable_buffer[RULETABLE_LEN]; static size_t ruletable_len; #define RANGE (0x10000UL) typedef uint16_t tbl_item; static uint16_t u16_buf[10000]; static int u16_buf_len; static int u16_starter; typedef struct { uint16_t ii_len; uint16_t ti_len; uint16_t ut_len; unsigned char ii_shift; unsigned char ti_shift; tbl_item *ii; tbl_item *ti; tbl_item *ut; } table_data; static size_t newopt(tbl_item *ut, size_t usize, int shift, table_data *tbl); #define MAX_COLLATION_WEIGHTS 4 #define MAX_FNO 1 #define MAX_FILES (MAX_FNO + 1) static FILE *fstack[MAX_FILES]; static char *fname[MAX_FILES]; static int lineno[MAX_FILES]; static int fno = -1; static tbl_item wcs2index[RANGE]; static char linebuf[1024]; static char *pos; static char *pos_e = NULL; static char end_of_token = 0; /* slot to save */ #define IN_ORDER 0x01 #define IN_REORDER 0x02 #define IN_REORDER_SECTIONS 0x04 static int order_state; static int cur_num_weights; /* number of weights in current use */ static char cur_rule[MAX_COLLATION_WEIGHTS]; static int anonsection = 0; typedef struct ll_item_struct ll_item_t; struct ll_item_struct { ll_item_t *next; ll_item_t *prev; void *data; int data_type; int idx; }; static ll_item_t *reorder_section_ptr = NULL; static int superset; static int superset_order_start_cnt; /* only support one order for now */ static int superset_in_sync; static ll_item_t *comm_cur_ptr; static ll_item_t *comm_prev_ptr; enum { R_FORWARD = 0x01, R_POSITION = 0x02, R_BACKWARD = 0x04 /* must be largest in value */ }; typedef struct { size_t num_weights; char rule[MAX_COLLATION_WEIGHTS]; const char *colitem[MAX_COLLATION_WEIGHTS]; } weight_t; static void *root_weight = NULL; size_t unique_weights = 0; typedef struct { const char *symbol; weight_t *weight; } weighted_item_t; typedef struct { const char *symbol1; const char *symbol2; int length; weight_t *weight; } range_item_t; typedef struct { const char *name; ll_item_t *itm_list; /* weighted_item_t list .. circular!!! */ size_t num_items; size_t num_rules; char rules[MAX_COLLATION_WEIGHTS]; } section_t; static section_t *cur_section = NULL; typedef struct { const char *symbol; ll_item_t *node; } wi_index_t; typedef struct col_locale_struct col_locale_t; struct col_locale_struct { char *name; void *root_colitem; /* all base and derived, or just derived */ void *root_element; void *root_scripts; void *root_wi_index; void *root_wi_index_reordered; ll_item_t *section_list; col_locale_t *base_locale; /* null if this is a base */ void *root_derived_wi; ll_item_t *derived_list; void *root_starter_char; void *root_starter_all; ll_item_t *undefined_idx; }; typedef struct { const char *symbol; int idx; } col_index_t; static void *root_col_locale = NULL; typedef struct { const char *keyword; void (*handler)(void); } keyword_table_t; typedef struct { const char *string; const char *element; /* NULL if collating symbol */ } colitem_t; static col_locale_t *cur_base = NULL; static col_locale_t *cur_derived = NULL; static col_locale_t *cur_col = NULL; static void *root_sym = NULL; static size_t num_sym = 0; static size_t mem_sym = 0; static const char *inputdir; static size_t inputdir_len; static unsigned verbose = 0; enum { VINFO = (1<<0), VDETAIL = (1<<1), }; static void error_msg(const char *fmt, ...) __attribute__ ((noreturn, format (printf, 1, 2))); static void *xmalloc(size_t n); static char *xsymdup(const char *s); /* only allocate once... store in a tree */ static void pushfile(char *filename); static void popfile(void); static void processfile(void); static int iscommentchar(int); static void eatwhitespace(void); static int next_line(void); static char *next_token(void); static void do_unrecognized(void); static col_locale_t *new_col_locale(char *name); static ll_item_t *new_ll_item(int data_type, void *data); static weight_t *register_weight(weight_t *w); static size_t ll_len(ll_item_t *l); static size_t ll_count(ll_item_t *l, int mask); static void add_wi_index(ll_item_t *l); static size_t tnumnodes(const void *root); static ll_item_t *find_wi_index(const char *sym, col_locale_t *cl); static void mark_reordered(const char *sym); static ll_item_t *find_wi_index_reordered(const char *sym); static ll_item_t *next_comm_ptr(void); static ll_item_t *init_comm_ptr(void); static ll_item_t *find_ll_last(ll_item_t *p); static void dump_weights(const char *name); static void finalize_base(void); static int is_ucode(const char *s); static int sym_cmp(const void *n1, const void *n2); static void do_starter_lists(col_locale_t *cl); static void dump_base_locale(int n); static void dump_der_locale(int n); static void dump_collate(FILE *fp); enum { DT_SECTION = 0x01, DT_WEIGHTED = 0x02, DT_REORDER = 0x04, /* a section to support reorder_after */ DT_COL_LOCALE = 0x08, DT_RANGE = 0x10, }; static int verbose_msg(const unsigned lvl, const char *fmt, ...) { va_list arg; int ret = 0; if (verbose & lvl) { va_start(arg, fmt); ret = vfprintf(stderr, fmt, arg); va_end(arg); } return ret; } static section_t *new_section(const char *name) { section_t *p; char buf[128]; p = xmalloc(sizeof(section_t)); if (!name) { /* anonymous section */ name = buf; snprintf(buf, sizeof(buf), "anon%05d", anonsection); ++anonsection; } else if (*name != '<') { /* reorder */ name = buf; snprintf(buf, sizeof(buf), "%s %05d", cur_col->name, anonsection); ++anonsection; } /* verbose_msg(VDETAIL, "section %s\n", name); */ p->name = xsymdup(name); p->itm_list = NULL; p->num_items = 0; p->num_rules = 0; memset(p->rules, 0, MAX_COLLATION_WEIGHTS); /* cur_num_weights = p->num_rules = 0; */ /* memset(p->rules, 0, MAX_COLLATION_WEIGHTS); */ /* memset(cur_rule, R_FORWARD, 4); */ if (*p->name == 'a') { cur_num_weights = p->num_rules = 4; memset(p->rules, R_FORWARD, 4); memset(cur_rule, R_FORWARD, 4); p->rules[3] |= R_POSITION; cur_rule[3] |= R_POSITION; } /* verbose_msg(VDETAIL, "new section %s -- cur_num_weights = %d\n", p->name, cur_num_weights); */ return p; } static void do_order_start(void); static void do_order_end(void); static void do_reorder_after(void); static void do_reorder_end(void); static void do_reorder_sections_after(void); static void do_reorder_sections_end(void); static void do_copy(void); static void do_colsym(void); static void do_colele(void); static void do_script(void); static void do_range(void); static col_locale_t *new_col_locale(char *name); static int colitem_cmp(const void *n1, const void *n2); static int colelement_cmp(const void *n1, const void *n2); static void del_colitem(colitem_t *p); static colitem_t *new_colitem(char *item, char *def); static void add_colitem(char *item, char *def); static void add_script(const char *s); static unsigned int add_rule(weighted_item_t *wi); static unsigned int add_range_rule(range_item_t *ri); static const keyword_table_t keyword_table[] = { { "collating-symbol", do_colsym }, { "collating-element", do_colele }, { "script", do_script }, { "copy", do_copy }, { "order_start", do_order_start }, { "order_end", do_order_end }, { "order-end", do_order_end }, { "reorder-after", do_reorder_after }, { "reorder-end", do_reorder_end }, { "reorder-sections-after", do_reorder_sections_after }, { "reorder-sections-end", do_reorder_sections_end }, { "UCLIBC_RANGE", do_range }, { NULL, do_unrecognized } }; static void do_unrecognized(void) { #if 1 error_msg("warning: unrecognized: %s", pos); #else /* verbose_msg(VDETAIL, "warning: unrecognized initial keyword \"%s\"\n", pos); */ fprintf(stderr, "warning: unrecognized: %s", pos); if (end_of_token) { fprintf(stderr, "%c%s", end_of_token, pos_e+1); } fprintf(stderr, "\n"); #endif } /* typedef struct { */ /* const char *symbol1; */ /* const char *symbol2; */ /* int length; */ /* weight_t *weight; */ /* } range_item_t; */ static void do_range(void) { range_item_t *ri; weight_t w; int i; char *s; char *s1; char *s2; const char **ci; ll_item_t *lli; assert(!superset); assert(order_state == IN_ORDER); s1 = next_token(); if (!s1) { error_msg("missing start of range"); } if (!is_ucode(s1)) { error_msg("start of range is not a ucode: %s", s1); } s1 = xsymdup(s1); s2 = next_token(); if (!s2) { error_msg("missing end of range"); } if (!is_ucode(s2)) { error_msg("end of range is not a ucode: %s", s2); } s2 = xsymdup(s2); ri = (range_item_t *) xmalloc(sizeof(range_item_t)); ri->symbol1 = s1; ri->symbol2 = s2; ri->length = strtoul(s2+2, NULL, 16) - strtoul(s1+2, NULL, 16); if (ri->length <= 0) { error_msg("illegal range length %d", ri->length); } s = next_token(); w.num_weights = cur_num_weights; for (i=0 ; i < cur_num_weights ; i++) { w.rule[i] = cur_rule[i]; } ci = w.colitem + (i-1); /* now i == cur_num_weights */ #define STR_DITTO "." while (s && *s && i) { --i; if (*s == ';') { ci[-i] = xsymdup(STR_DITTO); if (*++s) { continue; } } if (*s) { ci[-i] = xsymdup(s); } s = next_token(); if (s) { if (*s == ';') { ++s; } else if (i) { error_msg("missing seperator"); } } } if (s) { error_msg("too many weights: %d %d |%s| %d", cur_num_weights, i, s, (int)*s); } while (i) { /* missing weights are not an error */ --i; ci[-i] = xsymdup(STR_DITTO); } ri->weight = register_weight(&w); /* if ((i = is_ucode(t)) != 0) { */ /* assert(!t[i]); */ /* add_colitem(t, NULL); */ /* } */ lli = new_ll_item(DT_RANGE, ri); if (!cur_section->itm_list) { /* printf("creating new item list: %s\n", wi->symbol); */ cur_section->itm_list = lli; lli->prev = lli->next = lli; ++cur_section->num_items; } else { insque(lli, cur_section->itm_list->prev); /* printf("adding item to list: %d - %s\n", ll_len(cur_section->itm_list), wi->symbol); */ ++cur_section->num_items; } /* add_wi_index(lli); */ } static weighted_item_t *add_weight(char *t) { weighted_item_t *wi; weight_t w; int i; char *s; const char **ci; t = xsymdup(t); s = next_token(); w.num_weights = cur_num_weights; for (i=0 ; i < cur_num_weights ; i++) { w.rule[i] = cur_rule[i]; } ci = w.colitem + (i-1); /* now i == cur_num_weights */ while (s && *s && i) { --i; if (*s == ';') { ci[-i] = xsymdup(STR_DITTO); if (*++s) { continue; } } if (*s) { if (!strcmp(s,t)) { s = STR_DITTO; } ci[-i] = xsymdup(s); } s = next_token(); if (s) { if (*s == ';') { ++s; } else if (i) { error_msg("missing seperator"); } } } if (s) { error_msg("too many weights: %d %d |%s| %d", cur_num_weights, i, s, (int)*s); } while (i) { /* missing weights are not an error */ --i; ci[-i] = xsymdup(STR_DITTO); } wi = xmalloc(sizeof(weighted_item_t)); wi->symbol = t; wi->weight = register_weight(&w); if ((i = is_ucode(t)) != 0) { assert(!t[i]); add_colitem(t, NULL); } return wi; } static void add_superset_weight(char *t) { ll_item_t *lli; weighted_item_t *wi; if (!comm_cur_ptr || (strcmp(t, ((weighted_item_t *)(comm_cur_ptr->data))->symbol) != 0) ) { /* now out of sync */ if (superset_in_sync) { /* need a new section */ superset_in_sync = 0; cur_section = new_section("R"); cur_num_weights = cur_section->num_rules = ((section_t *)(cur_base->section_list->data))->num_rules; memcpy(cur_rule, ((section_t *)(cur_base->section_list->data))->rules, MAX_COLLATION_WEIGHTS); memcpy(cur_section->rules, ((section_t *)(cur_base->section_list->data))->rules, MAX_COLLATION_WEIGHTS); insque(new_ll_item(DT_REORDER, cur_section), find_ll_last(cur_col->section_list)); assert(comm_prev_ptr); lli = new_ll_item(DT_REORDER, cur_section); lli->prev = lli->next = lli; insque(lli, comm_prev_ptr); /* verbose_msg(VDETAIL, " subsection -----------------------\n"); */ } /* verbose_msg(VDETAIL, " %s %s\n", t, ((weighted_item_t *)(comm_cur_ptr->data))->symbol); */ wi = add_weight(t); lli = new_ll_item(DT_WEIGHTED, wi); mark_reordered(wi->symbol); /* printf("reorder: %s\n", t); */ if (!cur_section->itm_list) { cur_section->itm_list = lli; lli->prev = lli->next = lli; ++cur_section->num_items; } else { insque(lli, cur_section->itm_list->prev); ++cur_section->num_items; } add_wi_index(lli); } else { /* in sync */ superset_in_sync = 1; next_comm_ptr(); } } static void do_weight(char *t) { weighted_item_t *wi; ll_item_t *lli; if (superset) { add_superset_weight(t); return; } switch(order_state) { case 0: /* fprintf(stdout, "no-order weight: %s\n", t); */ /* break; */ case IN_ORDER: /* in a section */ /* fprintf(stdout, "weight: %s\n", t); */ wi = add_weight(t); lli = new_ll_item(DT_WEIGHTED, wi); if (!cur_section->itm_list) { /* fprintf(stdout, "creating new item list: %s %s %p\n", wi->symbol, cur_section->name, lli); */ cur_section->itm_list = lli; lli->prev = lli->next = lli; ++cur_section->num_items; } else { insque(lli, cur_section->itm_list->prev); /* fprintf(stdout, "adding item to list: %d - %s %p\n", ll_len(cur_section->itm_list), wi->symbol, lli); */ ++cur_section->num_items; } add_wi_index(lli); break; case IN_REORDER: /* std rule - but in a block with an insert-after pt */ wi = add_weight(t); lli = new_ll_item(DT_WEIGHTED, wi); mark_reordered(wi->symbol); /* fprintf(stdout, "reorder: %s %s %p\n", t, cur_section->name, lli); */ if (!cur_section->itm_list) { cur_section->itm_list = lli; lli->prev = lli->next = lli; ++cur_section->num_items; } else { insque(lli, cur_section->itm_list->prev); ++cur_section->num_items; } add_wi_index(lli); break; case IN_REORDER_SECTIONS: t = xsymdup(t); if (next_token() != NULL) { error_msg("trailing text in reorder section item: %s", pos); } lli = cur_col->section_list; do { if (lli->data_type & DT_SECTION) { if (!strcmp(((section_t *)(lli->data))->name, t)) { lli->data_type = DT_REORDER; lli = new_ll_item(DT_REORDER, (section_t *)(lli->data)); insque(lli, reorder_section_ptr); reorder_section_ptr = lli; return; } } lli = lli->next; } while (lli); error_msg("reorder_sections_after for non-base item currently not supported: %s", t); /* fprintf(stdout, "reorder_secitons: %s\n", t); */ break; default: error_msg("invalid order_state %d", order_state); } } static int col_locale_cmp(const void *n1, const void *n2) { return strcmp(((const col_locale_t *) n1)->name, ((const col_locale_t *) n2)->name); } static void processfile(void) { char *t; const keyword_table_t *k; order_state = 0; /* cur_num_weights = 0; */ /* cur_num_weights = 4; */ /* memset(cur_rule, R_FORWARD, 4); */ if (cur_col != cur_base) { cur_col->base_locale = cur_base; cur_col->undefined_idx = cur_base->undefined_idx; if (!cur_base->derived_list) { cur_base->derived_list = new_ll_item(DT_COL_LOCALE, cur_col); } else { insque(new_ll_item(DT_COL_LOCALE, cur_col), find_ll_last(cur_base->derived_list)); } } if (tfind(cur_col, &root_col_locale, col_locale_cmp)) { error_msg("attempt to read locale: %s", cur_col->name); } if (!tsearch(cur_col, &root_col_locale, col_locale_cmp)) { error_msg("OUT OF MEMORY!"); } if (superset) { superset_order_start_cnt = 0; superset_in_sync = 0; init_comm_ptr(); } while (next_line()) { /* printf("%5d:", lineno[fno]); */ /* while ((t = next_token()) != NULL) { */ /* printf(" |%s|", t); */ /* printf("\n"); */ /* } */ t = next_token(); assert(t); assert(t == pos); if ((*t == '<') || (!strcmp(t, "UNDEFINED"))) { do_weight(t); } else { for (k = keyword_table ; k->keyword ; k++) { if (!strcmp(k->keyword, t)) { break; } } k->handler(); } } if (cur_base == cur_col) { verbose_msg(VDETAIL, "Base: %15s", cur_col->name); } else { #if 1 if (!cur_col->undefined_idx) { #if 0 if (superset) { if (superset_order_start_cnt == 1) { --superset_order_start_cnt; /* ugh.. hack this */ } } #endif /* This is an awful hack to get around the problem of unspecified UNDEFINED * definitions in the supported locales derived from iso14651_t1. */ if (!strcmp(cur_base->name, "iso14651_t1")) { fprintf(stderr, "Warning: adding UNDEFINED entry for %s\n", cur_col->name); strcpy(linebuf, "script <UNDEFINED_SECTION>\n"); pos_e = NULL; pos = linebuf; t = next_token(); assert(t); assert(t == pos); do_script(); strcpy(linebuf, "order_start <UNDEFINED_SECTION>;forward;backward;forward;forward,position\n"); pos_e = NULL; pos = linebuf; t = next_token(); assert(t); assert(t == pos); do_order_start(); strcpy(linebuf, "UNDEFINED IGNORE;IGNORE;IGNORE\n"); pos_e = NULL; pos = linebuf; t = next_token(); assert(t); assert(t == pos); do_weight(t); strcpy(linebuf, "order_end\n"); pos_e = NULL; pos = linebuf; t = next_token(); assert(t); assert(t == pos); do_order_end(); } else { error_msg("no definition of UNDEFINED for %s", cur_col->name); } } #endif verbose_msg(VDETAIL, " Der: %15s", cur_col->name); } { #if 0 ll_item_t *p = cur_col->section_list; #endif verbose_msg(VDETAIL, "%6u weights", tnumnodes(cur_col->root_wi_index)); if (cur_base) { verbose_msg(VDETAIL, " %6u der %6u reor %6u starter - %u new stubs", tnumnodes(cur_base->root_derived_wi), tnumnodes(cur_base->root_wi_index_reordered), tnumnodes(cur_base->root_starter_char), ll_count(cur_col->section_list, DT_REORDER)); } verbose_msg(VDETAIL, "\n"); #if 0 while (p) { assert(((section_t *)(p->data))->num_items == ll_len(((section_t *)(p->data))->itm_list)); if (!p->next && ((*((section_t *)(p->data))->name == 'a') && (((section_t *)(p->data))->num_items == 0)) ) { break; } if (!(p->data_type & DT_REORDER)) { if ((*((section_t *)(p->data))->name != 'a') || (((section_t *)(p->data))->num_items > 0) ) { verbose_msg(VDETAIL, /* "\t%-15s %zu\n", */ "\t%-15s %6u\n", ((section_t *)(p->data))->name, ((section_t *)(p->data))->num_items); } } p = p->next; } #endif } } static void print_colnode(const void *ptr, VISIT order, int level) { const colitem_t *p = *(const colitem_t **) ptr; if (order == postorder || order == leaf) { printf("collating item = \"%s\"", p->string); if (p->element) { printf(" is %s", p->element); } printf("\n"); } } static void print_weight_node(const void *ptr, VISIT order, int level) { const weight_t *p = *(const weight_t **) ptr; int i; if (order == postorder || order == leaf) { printf("weight: (%d) ", p->num_weights); for (i = 0 ; i < p->num_weights ; i++) { if (p->rule[i] & R_FORWARD) { printf("F"); } if (p->rule[i] & R_BACKWARD) { printf("B"); } if (p->rule[i] & R_POSITION) { printf("P"); } printf(","); } for (i = 0 ; i < p->num_weights ; i++) { printf(" %s", p->colitem[i]); } printf("\n"); } } typedef struct { const char *der_name; int base_locale; } deps_t; enum { BASE_iso14651_t1, BASE_comm, BASE_cs_CZ, BASE_ar_SA, BASE_th_TH, BASE_ja_JP, BASE_ko_KR, BASE_MAX }; static const char *base_name[] = { "iso14651_t1", "comm", "cs_CZ", "ar_SA", "th_TH", "ja_JP", "ko_KR" }; static ll_item_t *locale_list[BASE_MAX]; static void init_locale_list(void) { int i; for (i=0 ; i < BASE_MAX ; i++) { locale_list[i] = (ll_item_t *) xmalloc(sizeof(ll_item_t)); locale_list[i]->prev = locale_list[i]->next = locale_list[i]; locale_list[i]->data = (void *) base_name[i]; } } deps_t deps[] = { { "af_ZA", BASE_iso14651_t1 }, { "am_ET", BASE_iso14651_t1 }, { "ar_AE", BASE_iso14651_t1 }, { "ar_BH", BASE_iso14651_t1 }, { "ar_DZ", BASE_iso14651_t1 }, { "ar_EG", BASE_iso14651_t1 }, { "ar_IN", BASE_iso14651_t1 }, { "ar_IQ", BASE_iso14651_t1 }, { "ar_JO", BASE_iso14651_t1 }, { "ar_KW", BASE_iso14651_t1 }, { "ar_LB", BASE_iso14651_t1 }, { "ar_LY", BASE_iso14651_t1 }, { "ar_MA", BASE_iso14651_t1 }, { "ar_OM", BASE_iso14651_t1 }, { "ar_QA", BASE_iso14651_t1 }, { "ar_SA", BASE_ar_SA }, { "ar_SD", BASE_iso14651_t1 }, { "ar_SY", BASE_iso14651_t1 }, { "ar_TN", BASE_iso14651_t1 }, { "ar_YE", BASE_iso14651_t1 }, { "az_AZ", BASE_iso14651_t1 }, { "be_BY", BASE_iso14651_t1 }, { "bg_BG", BASE_iso14651_t1 }, { "bn_BD", BASE_iso14651_t1 }, { "bn_IN", BASE_iso14651_t1 }, { "br_FR", BASE_iso14651_t1 }, { "bs_BA", BASE_iso14651_t1 }, { "ca_ES", BASE_comm }, { "cs_CZ", BASE_cs_CZ }, { "cy_GB", BASE_iso14651_t1 }, { "da_DK", BASE_comm }, { "de_AT", BASE_iso14651_t1 }, { "de_BE", BASE_iso14651_t1 }, { "de_CH", BASE_iso14651_t1 }, { "de_DE", BASE_iso14651_t1 }, { "de_LU", BASE_iso14651_t1 }, { "el_GR", BASE_iso14651_t1 }, { "en_AU", BASE_iso14651_t1 }, { "en_BW", BASE_iso14651_t1 }, { "en_CA", BASE_comm }, { "en_DK", BASE_iso14651_t1 }, { "en_GB", BASE_iso14651_t1 }, { "en_HK", BASE_iso14651_t1 }, { "en_IE", BASE_iso14651_t1 }, { "en_IN", BASE_iso14651_t1 }, { "en_NZ", BASE_iso14651_t1 }, { "en_PH", BASE_iso14651_t1 }, { "en_SG", BASE_iso14651_t1 }, { "en_US", BASE_iso14651_t1 }, { "en_ZA", BASE_iso14651_t1 }, { "en_ZW", BASE_iso14651_t1 }, { "eo_EO", BASE_iso14651_t1 }, { "es_AR", BASE_comm }, { "es_BO", BASE_comm }, { "es_CL", BASE_comm }, { "es_CO", BASE_comm }, { "es_CR", BASE_comm }, { "es_DO", BASE_comm }, { "es_EC", BASE_comm }, { "es_ES", BASE_comm }, { "es_GT", BASE_comm }, { "es_HN", BASE_comm }, { "es_MX", BASE_comm }, { "es_NI", BASE_comm }, { "es_PA", BASE_comm }, { "es_PE", BASE_comm }, { "es_PR", BASE_comm }, { "es_PY", BASE_comm }, { "es_SV", BASE_comm }, { "es_US", BASE_comm }, { "es_UY", BASE_comm }, { "es_VE", BASE_comm }, { "et_EE", BASE_comm }, { "eu_ES", BASE_iso14651_t1 }, { "fa_IR", BASE_iso14651_t1 }, { "fi_FI", BASE_comm }, { "fo_FO", BASE_comm }, { "fr_BE", BASE_iso14651_t1 }, { "fr_CA", BASE_comm }, { "fr_CH", BASE_iso14651_t1 }, { "fr_FR", BASE_iso14651_t1 }, { "fr_LU", BASE_iso14651_t1 }, { "ga_IE", BASE_iso14651_t1 }, { "gd_GB", BASE_iso14651_t1 }, { "gl_ES", BASE_comm }, { "gv_GB", BASE_iso14651_t1 }, { "he_IL", BASE_iso14651_t1 }, { "hi_IN", BASE_iso14651_t1 }, { "hr_HR", BASE_comm }, { "hu_HU", BASE_iso14651_t1 }, { "hy_AM", BASE_iso14651_t1 }, { "id_ID", BASE_iso14651_t1 }, { "is_IS", BASE_comm }, { "it_CH", BASE_iso14651_t1 }, { "it_IT", BASE_iso14651_t1 }, { "iw_IL", BASE_iso14651_t1 }, { "ja_JP", BASE_ja_JP }, { "ka_GE", BASE_iso14651_t1 }, { "kl_GL", BASE_comm }, { "ko_KR", BASE_ko_KR }, { "kw_GB", BASE_iso14651_t1 }, { "lt_LT", BASE_comm }, { "lv_LV", BASE_comm }, { "mi_NZ", BASE_iso14651_t1 }, { "mk_MK", BASE_iso14651_t1 }, { "mr_IN", BASE_iso14651_t1 }, { "ms_MY", BASE_iso14651_t1 }, { "mt_MT", BASE_iso14651_t1 }, { "nl_BE", BASE_iso14651_t1 }, { "nl_NL", BASE_iso14651_t1 }, { "nn_NO", BASE_iso14651_t1 }, { "no_NO", BASE_comm }, { "oc_FR", BASE_iso14651_t1 }, { "pl_PL", BASE_comm }, { "pt_BR", BASE_iso14651_t1 }, { "pt_PT", BASE_iso14651_t1 }, { "ro_RO", BASE_iso14651_t1 }, { "ru_RU", BASE_iso14651_t1 }, { "ru_UA", BASE_iso14651_t1 }, { "se_NO", BASE_iso14651_t1 }, { "sk_SK", BASE_cs_CZ }, { "sl_SI", BASE_comm }, { "sq_AL", BASE_iso14651_t1 }, { "sr_YU", BASE_iso14651_t1 }, { "sv_FI", BASE_comm }, { "sv_SE", BASE_iso14651_t1 }, { "ta_IN", BASE_iso14651_t1 }, { "te_IN", BASE_iso14651_t1 }, { "tg_TJ", BASE_iso14651_t1 }, { "th_TH", BASE_th_TH }, { "ti_ER", BASE_iso14651_t1 }, { "ti_ET", BASE_iso14651_t1 }, { "tl_PH", BASE_iso14651_t1 }, { "tr_TR", BASE_comm }, { "tt_RU", BASE_iso14651_t1 }, { "uk_UA", BASE_iso14651_t1 }, { "ur_PK", BASE_iso14651_t1 }, { "uz_UZ", BASE_iso14651_t1 }, { "vi_VN", BASE_iso14651_t1 }, { "wa_BE", BASE_iso14651_t1 }, { "yi_US", BASE_iso14651_t1 }, { "zh_CN", BASE_iso14651_t1 }, { "zh_HK", BASE_iso14651_t1 }, { "zh_SG", BASE_iso14651_t1 }, { "zh_TW", BASE_iso14651_t1 }, }; static int der_count[BASE_MAX]; static const char *new_args[500]; static int new_arg_count; static int dep_cmp(const void *s1, const void *s2) { return strcmp( (const char *) s1, ((const deps_t *) s2)->der_name); } static int old_main(int argc, char **argv); int main(int argc, char **argv) { const deps_t *p; ll_item_t *lli; int i; int total; char *output_file = "locale_collate.h"; unsigned verbosity = 0; if (argc < 3) { return EXIT_FAILURE; } --argc; inputdir = strdup(*++argv); inputdir_len = strlen(inputdir); init_locale_list(); while (--argc) { ++argv; if (!strcmp(*argv, "-o")) { --argc; if (*++argv == NULL) { printf("-o <outfile> requires an argument\n"); return EXIT_FAILURE; } output_file = strdup(*argv); continue; } else if (!strcmp(*argv, "-v")) { verbosity++; continue; } p = (const deps_t *) bsearch(*argv, deps, sizeof(deps)/sizeof(deps[0]), sizeof(deps[0]), dep_cmp); if (!p) { if (!strcmp("C", *argv)) { printf("ignoring %s locale\n", *argv); continue; } else { printf("%s not found\n", *argv); return EXIT_FAILURE; } } i = p->base_locale; ++der_count[i]; if (!strcmp(base_name[i], *argv)) { /* same name as base, so skip after count incremented */ continue; } /* add it to the list. the main body will catch duplicates */ lli = (ll_item_t *) xmalloc(sizeof(ll_item_t)); lli->prev = lli->next = NULL; lli->data = (void *) *argv; insque(lli, locale_list[i]); } total = 0; for (i=0 ; i < BASE_MAX ; i++) { /* printf("der_count[%2d] = %3d\n", i, der_count[i]); */ total += der_count[i]; } /* printf("total = %d\n", total); */ new_args[new_arg_count++] = "dummyprogramname"; for (i=0 ; i < BASE_MAX ; i++) { if (!der_count[i]) { continue; } new_args[new_arg_count++] = (i == BASE_comm) ? "-c" : "-b"; lli = locale_list[i]; do { new_args[new_arg_count++] = (const char *) (lli->data); lli = lli->next; } while (lli != locale_list[i]); new_args[new_arg_count++] = "-f"; } for (i=0; i < verbosity; i++) new_args[new_arg_count++] = "-v"; new_args[new_arg_count++] = "-o"; new_args[new_arg_count++] = output_file; /* for (i=0 ; i < new_arg_count ; i++) { printf("%3d: %s\n", i, new_args[i]); } */ return old_main(new_arg_count, (char **) new_args); } /* usage... prog -b basefile derived {derived} -s single {single} */ static int old_main(int argc, char **argv) { int next_is_base = 0; int next_is_subset = 0; char *output_file = NULL; superset = 0; while (--argc) { ++argv; if (**argv == '-') { if ((*argv)[1] == 'd') { dump_weights((*argv) + 2); } else if ((*argv)[1] == 'f') { /* dump all weight rules */ finalize_base(); } else if ((*argv)[1] == 'R') { /* dump all weight rules */ twalk(root_weight, print_weight_node); } else if (((*argv)[1] == 'c') && !(*argv)[2]) { /* new common subset */ cur_base = cur_derived = NULL; next_is_subset = 1; next_is_base = 1; superset = 0; } else if (((*argv)[1] == 'b') && !(*argv)[2]) { /* new base locale */ cur_base = cur_derived = NULL; next_is_subset = 0; next_is_base = 1; superset = 0; } else if (((*argv)[1] == 's') && !(*argv)[2]) { /* single locales follow */ cur_base = cur_derived = NULL; next_is_subset = 0; next_is_base = 2; superset = 0; } else if (((*argv)[1] == 'o') && !(*argv)[2]) { /* output file */ --argc; output_file = *++argv; } else if (((*argv)[1] == 'v') && !(*argv)[2]) { /* verbose */ ++verbose; } else { error_msg("unrecognized option %s", *argv); } continue; } /* new file */ new_col_locale(*argv); /* automaticly sets cur_col */ if (next_is_base) { cur_base = cur_col; } else { cur_derived = cur_col; } pushfile(*argv); /* verbose_msg(VDETAIL, "processing file %s\n", *argv); */ processfile(); /* this does a popfile */ /* twalk(cur_col->root_colitem, print_colnode); */ if (next_is_base == 1) { next_is_base = 0; } if (next_is_subset) { next_is_subset = 0; superset = 1; } } verbose_msg(VINFO, "success!\n"); verbose_msg(VINFO, /* "num_sym=%zu mem_sym=%zu unique_weights=%zu\n", */ "num_sym=%u mem_sym=%u unique_weights=%u\n", num_sym, mem_sym, unique_weights); /* twalk(root_weight, print_weight_node); */ verbose_msg(VINFO, "num base locales = %d num derived locales = %d\n", base_locale_len, der_locale_len); verbose_msg(VINFO, "override_len = %d multistart_len = %d weightstr_len = %d\n" "wcs2colidt_len = %d index2weight_len = %d index2ruleidx_len = %d\n" "ruletable_len = %d\n" "total size is %d bytes or %d kB\n", override_len, multistart_len, weightstr_len, wcs2colidt_len, index2weight_len, index2ruleidx_len, ruletable_len, (override_len + multistart_len + weightstr_len + wcs2colidt_len + index2weight_len + index2ruleidx_len + ruletable_len) * 2, (override_len + multistart_len + weightstr_len + wcs2colidt_len + index2weight_len + index2ruleidx_len + ruletable_len + 511) / 512); #if 0 { int i; for (i=0 ; i < base_locale_len ; i++) { dump_base_locale(i); } for (i=0 ; i < der_locale_len ; i++) { dump_der_locale(i); } } #endif { FILE *fp = fopen(output_file, "w"); if (!fp) { error_msg("cannot open output file '%s'!", output_file); } dump_collate(fp); if (ferror(fp) || fclose(fp)) { error_msg("write error or close error for output file!\n"); } } return EXIT_SUCCESS; } static void error_msg(const char *fmt, ...) { va_list arg; fprintf(stderr, "Error: "); if (fno >= 0) { fprintf(stderr, "file %s (%d): ", fname[fno], lineno[fno]); } va_start(arg, fmt); vfprintf(stderr, fmt, arg); va_end(arg); fprintf(stderr, "\n"); exit(EXIT_FAILURE); } static void pushfile(char *filename) { char *inputfile; size_t inputfile_len; if (fno >= MAX_FNO) { error_msg("file stack size exceeded"); } inputfile_len = inputdir_len + strlen(filename) + 2; inputfile = xmalloc(inputfile_len); memset(inputfile, 0, inputfile_len); sprintf(inputfile, "%s/%s", inputdir, filename); if (!(fstack[++fno] = fopen(inputfile, "r"))) { --fno; /* oops */ error_msg("cannot open file %s: %s", inputfile, strerror(errno)); } fname[fno] = xsymdup(inputfile); lineno[fno] = 0; } static void popfile(void) { if (fno < 0) { error_msg("pop on empty file stack"); } /* free(fname[fno]); */ fclose(fstack[fno]); --fno; } static void eatwhitespace(void) { while (isspace(*pos)) { ++pos; } } static int iscommentchar(int c) { return ((c == '#') || (c == '%')); } static int next_line(void) { size_t n; char *s = linebuf; assert(fno >= 0); pos_e = NULL; do { if (fgets(s, sizeof(linebuf), fstack[fno]) != NULL) { ++lineno[fno]; n = strlen(linebuf); if ((n == sizeof(linebuf) - 1) && (linebuf[n-1] != '\n')) { /* Either line is too long or last line is very long with * no trailing newline. But we'll always treat it as an * errro. */ error_msg("line too long?"); } --n; /* Be careful... last line doesn't need a newline. */ if (linebuf[n] == '\n') { linebuf[n--] = 0; /* trim trailing newline */ } pos = linebuf; eatwhitespace(); if (*pos && !iscommentchar(*pos)) { /* not empty or comment line */ return 1; /* got a line */ } } else { /* eof */ popfile(); } } while (fno >= 0); return 0; } static char *next_token(void) { char *p; #if 0 if (pos_e == NULL) { return NULL pos = pos_e; *pos = end_of_token; end_of_token = 0; } #else if (pos_e != NULL) { pos = pos_e; *pos = end_of_token; end_of_token = 0; } #endif eatwhitespace(); p = pos; if (!*p || iscommentchar(*p)) { /* end of line or start of comment */ pos = pos_e = NULL; *p = 0; /* treat comment as end of line */ /* fprintf(stdout, "returning NUL token |%s|\n", pos); */ return NULL; #if 1 } else if (*p == '<') { /* collating symbol, element, or value */ while (*++p) { if ((*p == '/') && p[1]) { ++p; continue; } if (*p == '>') { pos_e = ++p; end_of_token = *p; *p = 0; /* fprintf(stdout, "returning col token |%s|\n", pos); */ return pos; } } } else if (*p == '"') { /* collating element value? */ while (*++p) { if (*p == '"') { /* found the end of the quoted string */ pos_e = ++p; end_of_token = *p; *p = 0; /* fprintf(stdout, "returning quote token |%s|\n", pos); */ return pos; } } #endif } else { /* some kind of keyword */ while (*++p) { if (isspace(*p) || (*p == ';')) { break; } } pos_e = p; end_of_token = *p; *p = 0; /* fprintf(stdout, "returning key token |%s|\n", pos); */ return pos; } error_msg("illegal token |%s|", pos); } static void *xmalloc(size_t n) { void *p; if (!(p = malloc(n))) { error_msg("OUT OF MEMORY"); } return p; } static void do_copy(void) { char *s; char *e; if ((s = next_token()) != NULL) { e = strchr(s + 1, '"'); if ((*s == '"') && e && (*e == '"') && !e[1]) { if (next_token() != NULL) { error_msg("illegal trailing text: %s", pos); } *e = 0; ++s; if (cur_base && !strcmp(cur_base->name,s)) { /* verbose_msg(VDETAIL, "skipping copy of base file %s\n", s); */ return; } /* verbose_msg(VDETAIL, "full copy of %s\n", s); */ pushfile(s); return; } } error_msg("illegal or missing arg for copy: %s", s); } static void do_colsym(void) { char *s; char *e; if ((s = next_token()) != NULL) { e = strrchr(s,'>'); if ((*s == '<') && e && (*e == '>') && !e[1]) { if (next_token() != NULL) { error_msg("illegal trailing text: %s", pos); } e[1] = 0; /* cleanup in case next_token stored something */ add_colitem(s,NULL); return; } } error_msg("illegal or missing arg for collating-symbol: %s", s); } static void do_colele(void) { char *s; char *e; char *s1; char *e1; int n; if ((s = next_token()) != NULL) { e = strrchr(s,'>'); if ((*s == '<') && e && (*e == '>') && !e[1]) { if (((s1 = next_token()) == NULL) || (strcmp(s1,"from") != 0) || ((s1 = next_token()) == NULL) || (*s1 != '\"') ) { error_msg("illegal format for collating-element spec"); } e1 = strchr(s1 + 1, '"'); if ((*s1 != '"') || !e1 || (*e1 != '"') || (e1[1] != 0)) { error_msg("illegal definition for collating-element: %s", s1); } if (next_token() != NULL) { error_msg("illegal trailing text: %s", pos); } e[1] = 0; /* cleanup in case next_token stored something */ e1[1] = 0; add_colitem(s,s1); ++s1; if (!(n = is_ucode(s1))) { error_msg("starting char must be a <U####> code: %s", s1); } assert(s1[n] == '<'); s1[n] = 0; s = xsymdup(s1); if (!(tsearch(s, &cur_base->root_starter_char, sym_cmp))) { error_msg("OUT OF MEMORY"); } return; } } error_msg("illegal or missing arg for collating-element: %s", s); } static ll_item_t *find_section_list_item(const char *name, col_locale_t *loc) { ll_item_t *p; if (!loc) { return NULL; } p = loc->section_list; while (p) { /* if (!((p->data_type == DT_SECTION) || (p->data_type == DT_REORDER))) { */ /* verbose_msg(VDETAIL, "fsli = %d\n", p->data_type); */ /* } */ assert((p->data_type == DT_SECTION) || (p->data_type == DT_REORDER)); if (!strcmp(name, ((section_t *)(p->data))->name)) { break; } p = p->next; } return p; } static ll_item_t *find_ll_last(ll_item_t *p) { assert(p); while (p->next) { p = p->next; } return p; } static void do_script(void) { char *s; char *e; if ((s = next_token()) != NULL) { e = strrchr(s,'>'); if ((*s == '<') && e && (*e == '>') && !e[1]) { if (next_token() != NULL) { error_msg("illegal trailing text: %s", pos); } e[1] = 0; /* cleanup in case next_token stored something */ add_script(s); return; } } error_msg("illegal or missing arg for script: %s", s); } static col_locale_t *new_col_locale(char *name) { ll_item_t *lli; ll_item_t *lli2; cur_col = (col_locale_t *) xmalloc(sizeof(col_locale_t)); cur_col->name = name; cur_col->root_colitem = NULL; cur_col->root_element = NULL; cur_col->root_scripts = NULL; cur_col->base_locale = NULL; if (!superset) { /* start with an anonymous section */ cur_section = new_section(NULL); cur_col->section_list = new_ll_item(DT_SECTION, cur_section); } else { /* start with a reorder section */ cur_section = new_section("R"); cur_num_weights = cur_section->num_rules = ((section_t *)(cur_base->section_list->data))->num_rules; memcpy(cur_rule, ((section_t *)(cur_base->section_list->data))->rules, MAX_COLLATION_WEIGHTS); memcpy(cur_section->rules, ((section_t *)(cur_base->section_list->data))->rules, MAX_COLLATION_WEIGHTS); cur_col->section_list = new_ll_item(DT_REORDER, cur_section); assert(cur_base->section_list->next == NULL); /* currently only one section allowed */ lli = ((section_t *)(cur_base->section_list->data))->itm_list; assert(lli); lli2 = new_ll_item(DT_REORDER, cur_section); lli2->prev = lli2->next = lli2; insque(lli2, lli->prev); ((section_t *)(cur_base->section_list->data))->itm_list = lli2; } /* cur_col->section_list = NULL; */ /* add_script(((section_t *)(cur_col->section_list->data))->name); */ cur_col->root_wi_index = NULL; cur_col->root_wi_index_reordered = NULL; cur_col->root_derived_wi = NULL; cur_col->derived_list = NULL; cur_col->root_starter_char = NULL; cur_col->root_starter_all = NULL; cur_col->undefined_idx = NULL; return cur_col; } static int colitem_cmp(const void *n1, const void *n2) { return strcmp(((colitem_t *)n1)->string, ((colitem_t *)n2)->string); } static int colelement_cmp(const void *n1, const void *n2) { int r; r = strcmp(((colitem_t *)n1)->string, ((colitem_t *)n2)->string); if (!r) { if (((colitem_t *)n1)->element && ((colitem_t *)n2)->element) { r = strcmp(((colitem_t *)n1)->element, ((colitem_t *)n2)->element); } else if (((colitem_t *)n1)->element == ((colitem_t *)n2)->element) { r = 0; /* both null */ } else { r = (((colitem_t *)n1)->element == NULL) ? -1 : 1; } } return r; } static void del_colitem(colitem_t *p) { /* free((void *) p->element); */ /* free((void *) p->string); */ free(p); } static colitem_t *new_colitem(char *item, char *def) { colitem_t *p; p = xmalloc(sizeof(colitem_t)); p->string = xsymdup(item); p->element = (!def) ? def : xsymdup(def); return p; } static void add_colitem(char *item, char *def) { colitem_t *p; #if 0 printf("adding collation item %s", item); if (def) { printf(" with definition %s", def); } printf("\n"); #endif p = new_colitem(item, def); if (superset) { if (tfind(p, &cur_base->root_colitem, colitem_cmp)) { /* verbose_msg(VDETAIL, "skipping superset duplicate collating item \"%s\"\n", p->string); */ del_colitem(p); return; /* } else { */ /* verbose_msg(VDETAIL, "superset: new collating item \"%s\" = %s\n", p->string, p->element); */ } } if (cur_col == cur_derived) { if (!tfind(p, &cur_base->root_colitem, colitem_cmp)) { /* not in current but could be in base */ if (!tsearch(p, &cur_base->root_colitem, colitem_cmp)) { error_msg("OUT OF MEMORY!"); } } else if (!tfind(p, &cur_base->root_colitem, colelement_cmp)) { error_msg("collating element/symbol mismatch: item=%s def=%s", item, def); } } if (!tfind(p, &cur_col->root_colitem, colitem_cmp)) { /* not in current but could be in base */ if (!tsearch(p, &cur_col->root_colitem, colitem_cmp)) { error_msg("OUT OF MEMORY!"); } } else if (!tfind(p, &cur_col->root_colitem, colelement_cmp)) { error_msg("collating element/symbol mismatch"); } else { /* already there */ fprintf(stderr, "duplicate collating item \"%s\"\n", p->string); del_colitem(p); } } /* add a script (section) to the current locale */ static void add_script(const char *s) { ll_item_t *l; /* make sure it isn't in base if working with derived */ if (cur_base != cur_col) { if (find_section_list_item(s, cur_base)) { error_msg("attempt to add script %s for derived when already in base", s); } } if (find_section_list_item(s, cur_col)) { error_msg("attempt to readd script %s", s); } l = find_ll_last(cur_col->section_list); insque(new_ll_item(DT_SECTION, new_section(s)), l); } static const char str_forward[] = "forward"; static const char str_backward[] = "backward"; static const char str_position[] = "position"; static void do_order_start(void) { const char *s; char *e; ll_item_t *l; section_t *sect; int rule; if (order_state & ~IN_ORDER) { error_msg("order_start following reorder{_sections}_after"); } order_state |= IN_ORDER; if (superset) { if (++superset_order_start_cnt > 1) { error_msg("currently only a common order_start is supported in superset"); } return; } if (!(s = next_token())) { s = str_forward; /* if no args */ } if (*s == '<') { /* section (script) */ e = strrchr(s,'>'); if ((*s == '<') && e && (*e == '>') && !e[1]) { e[1] = 0; /* cleanup in case next_token stored something */ if (!(l = find_section_list_item(s, cur_col))) { error_msg("ref of undefined sections: %s", s); } sect = (section_t *)(l->data); if (sect->num_rules) { error_msg("sections already defined: %s", s); } } else { error_msg("illegal section ref: %s", s); } if (!(s = next_token())) { s = str_forward; /* if no args */ } else if (*s != ';') { error_msg("missing seperator!"); } } else { /* need an anonymous section */ if ((*cur_section->name != '<') && (cur_section->num_items == 0)) { /* already in an empty anonymous section */ sect = cur_section; /* fprintf(stdout, "using empty anon section %s\n", sect->name); */ } else { sect = new_section(NULL); l = find_ll_last(cur_col->section_list); insque(new_ll_item(DT_SECTION, sect), l); /* fprintf(stdout, "adding order section after section %s\n", ((section_t *)(l->data))->name); */ /* fprintf(stdout, " last section is %s\n", ((section_t *)(l->next->data))->name); */ } sect->num_rules = 0; /* setting this below so nix default */ } cur_section = sect; /* fprintf(stdout, "cur_section now %s\n", cur_section->name); */ /* now do rules */ do { rule = 0; if (*s == ';') { ++s; } while (*s) { if (!strncmp(str_forward, s, 7)) { rule |= R_FORWARD; s += 7; } else if (!strncmp(str_backward, s, 8)) { rule |= R_BACKWARD; s += 8; } else if (!strncmp(str_position, s, 8)) { rule |= R_POSITION; s += 8; } if (*s == ',') { ++s; continue; } if (!*s || (*s == ';')) { if (sect->num_rules >= MAX_COLLATION_WEIGHTS) { error_msg("more than %d weight rules!", MAX_COLLATION_WEIGHTS); } if (!rule) { error_msg("missing weight rule!"); } if ((rule & (R_FORWARD|R_BACKWARD|R_POSITION)) > R_BACKWARD) { error_msg("backward paired with forward and/or position!"); } sect->rules[sect->num_rules++] = rule; rule = 0; continue; } error_msg("illegal weight rule: %s", s); } } while ((s = next_token()) != NULL); cur_section = sect; /* verbose_msg(VDETAIL, "setting cur_num_weights to %d for %s\n", sect->num_rules, sect->name); */ cur_num_weights = sect->num_rules; memcpy(cur_rule, sect->rules, MAX_COLLATION_WEIGHTS); } static void do_order_end(void) { if (!(order_state & IN_ORDER)) { error_msg("order_end with no matching order_start"); } order_state &= ~IN_ORDER; cur_section = new_section(NULL); } static void do_reorder_after(void) { char *t; ll_item_t *lli; const weight_t *w; int save_cur_num_weights; char save_cur_rule[MAX_COLLATION_WEIGHTS]; if (order_state & ~IN_REORDER) { error_msg("reorder_after following order_start or reorder_sections_after"); } order_state |= IN_REORDER; if (superset) { error_msg("currently reorder_after is not supported in supersets"); } if (!(t = next_token())) { error_msg("missing arg for reorder_after"); } t = xsymdup(t); if (next_token() != NULL) { error_msg("trailing text reorder_after: %s", pos); } if (cur_col == cur_base) { error_msg("sorry.. reorder_after in base locale is not currently supported"); } if (!(lli = find_wi_index(t, cur_base))) { error_msg("reorder_after for non-base item currently not supported: %s", t); } w = ((weighted_item_t *)(lli->data))->weight; save_cur_num_weights = cur_num_weights; memcpy(save_cur_rule, cur_rule, MAX_COLLATION_WEIGHTS); cur_section = new_section("R"); insque(new_ll_item(DT_REORDER, cur_section), lli); #if 0 { ll_item_t *l1; ll_item_t *l2; ll_item_t *l3; l1 = new_ll_item(DT_REORDER, cur_section); l2 = find_ll_last(cur_col->section_list); insque(l1, l2); l3 = find_ll_last(cur_col->section_list); verbose_msg(VDETAIL, "reorder_after %p %p %p %s\n", l1, l2, l3, cur_section->name); } #else insque(new_ll_item(DT_REORDER, cur_section), find_ll_last(cur_col->section_list)); #endif cur_num_weights = cur_section->num_rules = save_cur_num_weights; memcpy(cur_rule, save_cur_rule, MAX_COLLATION_WEIGHTS); memcpy(cur_section->rules, save_cur_rule, MAX_COLLATION_WEIGHTS); /* verbose_msg(VDETAIL, "reorder -- %s %d\n", ((weighted_item_t *)(lli->data))->symbol, w->num_weights); */ /* if (!w->num_weights) { */ /* } else { */ /* cur_num_weights = w->num_weights; */ /* memcpy(cur_rule, w->rule, MAX_COLLATION_WEIGHTS); */ /* } */ /* verbose_msg(VDETAIL, "reorder_after succeeded for %s\n", t); */ } static void do_reorder_end(void) { if (!(order_state & IN_REORDER)) { error_msg("reorder_end with no matching reorder_after"); } order_state &= ~IN_REORDER; } static void do_reorder_sections_after(void) { const char *t; ll_item_t *lli; if (order_state & ~IN_REORDER_SECTIONS) { error_msg("reorder_sections_after following order_start or reorder_after"); } order_state |= IN_REORDER_SECTIONS; if (superset) { error_msg("currently reorder_sections_after is not supported in supersets"); } if (!(t = next_token())) { error_msg("missing arg for reorder_sections_after"); } t = xsymdup(t); if (next_token() != NULL) { error_msg("trailing text reorder_sections_after: %s", pos); } if (cur_col == cur_base) { error_msg("sorry.. reorder_sections_after in base locale is not currently supported"); } lli = cur_base->section_list; do { /* verbose_msg(VDETAIL, "hmm -- |%s|%d|\n", ((section_t *)(lli->data))->name, lli->data_type); */ if (lli->data_type & DT_SECTION) { /* verbose_msg(VDETAIL, "checking |%s|%s|\n", ((section_t *)(lli->data))->name, t); */ if (!strcmp(((section_t *)(lli->data))->name, t)) { reorder_section_ptr = lli; return; } } lli = lli->next; } while (lli); error_msg("reorder_sections_after for non-base item currently not supported: %s", t); } static void do_reorder_sections_end(void) { if (!(order_state & IN_REORDER_SECTIONS)) { error_msg("reorder_sections_end with no matching reorder_sections_after"); } order_state &= ~IN_REORDER_SECTIONS; reorder_section_ptr = NULL; } static ll_item_t *new_ll_item(int data_type, void *data) { ll_item_t *p; p = xmalloc(sizeof(ll_item_t)); p->next = p->prev = NULL; p->data_type = data_type; p->data = data; p->idx = INT_MIN; return p; } static int sym_cmp(const void *n1, const void *n2) { /* verbose_msg(VDETAIL, "sym_cmp: |%s| |%s|\n", (const char *)n1, (const char *)n2); */ return strcmp((const char *) n1, (const char *) n2); } static char *xsymdup(const char *s) { void *p; if (!(p = tfind(s, &root_sym, sym_cmp))) { /* not a currently known symbol */ if (!(s = strdup(s)) || !(p = tsearch(s, &root_sym, sym_cmp))) { error_msg("OUT OF MEMORY!"); } ++num_sym; mem_sym += strlen(s) + 1; /* verbose_msg(VDETAIL, "xsymdup: alloc |%s| %p |%s| %p\n", *(char **)p, p, s, s); */ /* } else { */ /* verbose_msg(VDETAIL, "xsymdup: found |%s| %p\n", *(char **)p, p); */ } return *(char **) p; } static int weight_cmp(const void *n1, const void *n2) { const weight_t *w1 = (const weight_t *) n1; const weight_t *w2 = (const weight_t *) n2; int i, r; if (w1->num_weights != w2->num_weights) { return w1->num_weights - w2->num_weights; } for (i=0 ; i < w1->num_weights ; i++) { if (w1->rule[i] != w2->rule[i]) { return w1->rule[i] - w2->rule[i]; } if ((r = strcmp(w1->colitem[i], w2->colitem[i])) != 0) { return r; } } return 0; } static weight_t *register_weight(weight_t *w) { void *p; if (!(p = tfind(w, &root_weight, weight_cmp))) { /* new weight */ p = xmalloc(sizeof(weight_t)); memcpy(p, w, sizeof(weight_t)); if (!(p = tsearch(p, &root_weight, weight_cmp))) { error_msg("OUT OF MEMORY!"); } ++unique_weights; /* } else { */ /* verbose_msg(VDETAIL, "rw: found\n"); */ } return *(weight_t **)p; } static size_t ll_len(ll_item_t *l) { size_t n = 0; ll_item_t *p = l; while (p) { ++n; p = p->next; if (p == l) { /* work for circular too */ break; } } return n; } static size_t ll_count(ll_item_t *l, int mask) { size_t n = 0; ll_item_t *p = l; while (p) { if (p->data_type & mask) { ++n; } p = p->next; if (p == l) { /* work for circular too */ break; } } return n; } static int wi_index_cmp(const void *n1, const void *n2) { const char *s1 = ((weighted_item_t *)(((ll_item_t *) n1)->data))->symbol; const char *s2 = ((weighted_item_t *)(((ll_item_t *) n2)->data))->symbol; return strcmp(s1, s2); } static void add_wi_index(ll_item_t *l) { assert(l->data_type == DT_WEIGHTED); if (!strcmp(((weighted_item_t *)(l->data))->symbol, "UNDEFINED")) { cur_col->undefined_idx = l; } if (!tfind(l, &cur_col->root_wi_index, wi_index_cmp)) { /* new wi_index */ if (!tsearch(l, &cur_col->root_wi_index, wi_index_cmp)) { error_msg("OUT OF MEMORY!"); } } if (cur_base != cur_col) { if (!tfind(l, &cur_base->root_wi_index, wi_index_cmp)) {/* not a base val */ /* printf("derived: %s\n", ((weighted_item_t *)(l->data))->symbol); */ if (!tfind(l, &cur_base->root_derived_wi, wi_index_cmp)) { /* new derived */ if (!tsearch(l, &cur_base->root_derived_wi, wi_index_cmp)) { error_msg("OUT OF MEMORY!"); } } } } } static int final_index; static int is_ucode(const char *s) { if ((s[0] == '<') && (s[1] == 'U') && isxdigit(s[2]) && isxdigit(s[3]) && isxdigit(s[4]) && isxdigit(s[5]) && (s[6] == '>') ) { return 7; } else { return 0; } } static void add_final_col_index(const char *s) { ENTRY e; e.key = (char *) s; e.data = (void *)(final_index); if (!hsearch(e, FIND)) { /* not in the table */ if (!hsearch(e, ENTER)) { error_msg("OUT OF MEMORY! (hsearch)"); } #if 0 { int n; void *v; colitem_t ci; colitem_t *p; const char *t; if (!strcmp(s, "UNDEFINED")) { printf("%6d: %s\n", final_index, s); } else { assert(*s == '<'); if ((n = is_ucode(s)) != 0) { assert(!s[n]); printf("%6d: %s\n", final_index, s); } else { ci.string = (char *) s; ci.element = NULL; /* don't care */ v = tfind(&ci, &cur_base->root_colitem, colitem_cmp); if (!v) { verbose_msg(VDETAIL, "%s NOT DEFINED!!!\n", s); } else { p = *((colitem_t **) v); if (p->element != NULL) { t = p->element; assert(*t == '"'); ++t; n = is_ucode(t); assert(n); printf("%6d: %.*s | ", final_index, n, t); do { t += n; assert(*t); if (*t == '"') { assert(!t[1]); break; } n = is_ucode(t); assert(n); printf("%.*s", n, t); } while (1); printf(" collating-element %s\n", s); } else { printf("%6d: %s (collating-symbol)\n", final_index, s); } } } } } #endif ++final_index; } } static int final_index_val0(const char *s) { ENTRY *p; ENTRY e; e.key = (char *) s; if (!(p = hsearch(e, FIND))) { /* not in the table */ return 0; } return (int)(p->data); } static int final_index_val(const char *s) { ENTRY *p; ENTRY e; e.key = (char *) s; if (!(p = hsearch(e, FIND))) { /* not in the table */ error_msg("can't find final index: %s", s); } return (int)(p->data); } static size_t num_tree_nodes; static void count_nodes(const void *ptr, VISIT order, int level) { if ((order == postorder) || (order == leaf)) { ++num_tree_nodes; } } static size_t tnumnodes(const void *root) { num_tree_nodes = 0; twalk(root, count_nodes); return num_tree_nodes; } static ll_item_t *find_wi_index(const char *sym, col_locale_t *cl) { weighted_item_t w; ll_item_t l; void *p; w.symbol = sym; l.data = &w; l.data_type = DT_WEIGHTED; p = tfind(&l, &cl->root_wi_index, wi_index_cmp); if (p) { p = *(ll_item_t **)p; } return (ll_item_t *) p; } static void mark_reordered(const char *sym) { ll_item_t *lli; lli = find_wi_index(sym, cur_base); if (lli) { if (!tsearch(lli, &cur_base->root_wi_index_reordered, wi_index_cmp)) { error_msg("OUT OF MEMORY!"); } } } static ll_item_t *find_wi_index_reordered(const char *sym) { weighted_item_t w; ll_item_t l; void *p; w.symbol = sym; l.data = &w; l.data_type = DT_WEIGHTED; p = tfind(&l, &cur_base->root_wi_index_reordered, wi_index_cmp); if (p) { p = *(ll_item_t **)p; } return (ll_item_t *) p; } static ll_item_t *init_comm_ptr(void) { assert(cur_base); assert(cur_base->section_list); /* at the moment, only support one section in comm */ assert(cur_base->section_list->next == NULL); comm_cur_ptr = ((section_t *)(cur_base->section_list->data))->itm_list; while (comm_cur_ptr && (comm_cur_ptr->data_type & DT_REORDER)) { comm_cur_ptr = comm_cur_ptr->next; } /* { */ /* ll_item_t *p = comm_cur_ptr; */ /* verbose_msg(VDETAIL, "init_comm_ptr\n"); */ /* while (p != comm_cur_ptr) { */ /* if (p->data_type & DT_WEIGHTED) { */ /* verbose_msg(VDETAIL, "%s", ((weighted_item_t *)p)->symbol); */ /* } */ /* p = p->next; */ /* } */ /* } */ assert(comm_cur_ptr); /* verbose_msg(VDETAIL, "init_comm_ptr -- %s %p %p %p %d\n", */ /* ((weighted_item_t *)(comm_cur_ptr->data))->symbol, */ /* comm_cur_ptr, comm_cur_ptr->prev, comm_cur_ptr->next, */ /* ll_len(comm_cur_ptr)); */ comm_prev_ptr = NULL; return comm_cur_ptr; } static ll_item_t *next_comm_ptr(void) { /* at the moment, only support one section in comm */ assert(cur_base->section_list->next == NULL); comm_prev_ptr = comm_cur_ptr; while (comm_cur_ptr && ((comm_cur_ptr = comm_cur_ptr->next) != NULL)) { if (!(comm_cur_ptr->data_type & DT_REORDER)) { break; } } return comm_cur_ptr; } static int dump_count; #if 0 static void dump_section(section_t *s, int mask, col_locale_t *der) { ll_item_t *lli; ll_item_t *lli0; weighted_item_t *w; weight_t *p; int i; lli0 = lli = s->itm_list; if (!lli0) { return; } do { if (!(lli->data_type & mask)) { lli = lli->next; continue; } if (lli->data_type & DT_WEIGHTED) { ++dump_count; w = (weighted_item_t *)(lli->data); p = w->weight; printf("%6d: %s (%d) ", dump_count, w->symbol, p->num_weights); for (i = 0 ; i < p->num_weights ; i++) { if (p->rule[i] & R_FORWARD) { printf("F"); } if (p->rule[i] & R_BACKWARD) { printf("B"); } if (p->rule[i] & R_POSITION) { printf("P"); } printf(","); } for (i = 0 ; i < p->num_weights ; i++) { printf(" %s", p->colitem[i]); } printf("\n"); } else if (lli->data_type & (DT_SECTION|DT_REORDER)) { if (lli->data_type == DT_REORDER) { assert(der); if (strncmp(((section_t *)(lli->data))->name, der->name, strlen(der->name))) { lli = lli->next; continue; } } if (lli->data_type & DT_SECTION) { printf("SECTION -----------------\n"); } else { printf("REORDER -----------------\n"); } dump_section((section_t *)(lli->data), mask, der); printf("DONE --------------------\n"); } lli = lli->next; } while (lli != lli0); } #else static int in_reorder_section = 0; static void dump_section(section_t *s, int mask, col_locale_t *der) { ll_item_t *lli; ll_item_t *lli0; weighted_item_t *w; weight_t *p; int i; lli0 = lli = s->itm_list; if (!lli0) { return; } do { if (!(lli->data_type & mask)) { lli = lli->next; continue; } if (lli->data_type & DT_WEIGHTED) { ++dump_count; w = (weighted_item_t *)(lli->data); p = w->weight; #if 1 if (in_reorder_section) { printf(" %p", w); } #else printf("%6d: %s (%d) ", dump_count, w->symbol, p->num_weights); for (i = 0 ; i < p->num_weights ; i++) { if (p->rule[i] & R_FORWARD) { printf("F"); } if (p->rule[i] & R_BACKWARD) { printf("B"); } if (p->rule[i] & R_POSITION) { printf("P"); } printf(","); } for (i = 0 ; i < p->num_weights ; i++) { printf(" %s", p->colitem[i]); } printf("\n"); #endif } else if (lli->data_type & (DT_SECTION|DT_REORDER)) { if (lli->data_type == DT_REORDER) { assert(der); if (strncmp(((section_t *)(lli->data))->name, der->name, strlen(der->name))) { lli = lli->next; continue; } } if (lli->data_type & DT_SECTION) { /* printf("SECTION -----------------\n"); */ assert(0); } else { /* printf("REORDER -----------------\n"); */ in_reorder_section = 1; } dump_section((section_t *)(lli->data), mask, der); /* printf("DONE --------------------\n"); */ printf("\n"); in_reorder_section = 0; } lli = lli->next; } while (lli != lli0); } #endif static void dump_weights(const char *name) { ll_item_t *lli; col_locale_t *base; col_locale_t *der; col_locale_t cl; void *p; assert(name); if (!*name) { /* use last */ base = cur_base; der = cur_derived; } else { cl.name = (char *) name; if (!(p = tfind(&cl, &root_col_locale, col_locale_cmp))) { error_msg("unknown locale: %s", name); } base = *((col_locale_t **) p); der = NULL; if (base->base_locale) { /* oops... really derived */ der = base; base = der->base_locale; } } dump_count = 0; if (base) { /* printf("BASE - %s\n", base->name); */ for (lli = base->section_list ; lli ; lli = lli->next) { /* printf("SECTION %s\n", ((section_t *)(lli->data))->name); */ dump_section((section_t *)(lli->data), ~0, der); } } assert(der != base); if (der) { /* printf("DERIVED - %s\n", der->name); */ for (lli = der->section_list ; lli ; lli = lli->next) { if (lli->data_type == DT_SECTION) { dump_section((section_t *)(lli->data), DT_WEIGHTED, der); } } } /* printf("DONE\n"); */ } static void print_starter_node(const void *ptr, VISIT order, int level) { if (order == postorder || order == leaf) { fprintf(stderr, " %s\n", *(const char **) ptr); } } static void finalize_base(void) { ll_item_t *s; ll_item_t *h; ll_item_t *lli; ll_item_t *h2; ll_item_t *l2; ll_item_t *cli; ll_item_t *rli = NULL; weighted_item_t *w; weight_t *p; int i, n, mr, r, mi; col_locale_t *cl; void *mm; int num_invariant = 0; int num_varying = 0; int max_weight; int index2weight_len_inc = 1; assert(cur_base); assert(base_locale_len+1 < BASE_LOCALE_LEN); base_locale_array[base_locale_len].name = cur_base->name; base_locale_array[base_locale_len].num_weights = 1; base_locale_array[base_locale_len].index2weight_offset = index2weight_len; base_locale_array[base_locale_len].index2ruleidx_offset = index2ruleidx_len; if (!strcmp(cur_base->name,"ja_JP") || !strcmp(cur_base->name,"ko_KR")) { index2weight_len_inc = 0; } /* printf("%s -- index2weight_len = %d\n", cur_base->name, index2weight_len); */ if (!hcreate(30000)) { error_msg("OUT OF MEMORY!"); } /* first pass ... set the fixed indexes */ final_index = i = 1; mr = 0; for (s = cur_base->section_list ; s ; s = s->next) { #if 1 if (s->data_type & DT_REORDER) { /* a reordered section */ verbose_msg(VDETAIL, "pass1: reordered section %s - xxx\n", ((section_t *)(s->data))->name); lli = ((section_t *)(s->data))->itm_list; r = 0; if (lli) { /* r = ll_len( ((section_t *)(lli->data))->itm_list ); */ r = ll_len(lli) + 1; } if (r > mr) { mr = r; } verbose_msg(VDETAIL, "pass1: reordered section %s - %d\n", ((section_t *)(s->data))->name, r); continue; } #endif h = lli = ((section_t *)(s->data))->itm_list; if (!lli) { continue; } do { if (lli->data_type & DT_RANGE) { i += mr; mr = 0; /* ++i; */ lli->idx = i; assert(!rli); rli = lli; verbose_msg(VDETAIL, "range pre = %d after = ", i); i += ((range_item_t *)(lli->data))->length + 1; /* ++i; */ verbose_msg(VDETAIL, "%d\n", i); if (!index2weight_len_inc) { /* ko_KR hack */ final_index += ((range_item_t *)(lli->data))->length + 1; } /* add_final_col_index("RANGE"); */ } else if (lli->data_type & DT_WEIGHTED) { i += mr; mr = 0; w = (weighted_item_t *)(lli->data); if (find_wi_index_reordered(w->symbol)) { /* reordered symbol so skip on first pass */ ++num_varying; ++i; continue; } ++num_invariant; index2weight_buffer[index2weight_len] = lli->idx = i++; index2weight_len += index2weight_len_inc; add_final_col_index(w->symbol); } else { assert(lli->data_type & DT_REORDER); r = ll_len( ((section_t *)(lli->data))->itm_list ); if (r > mr) { mr = r; } /* r = 0; */ } } while ((lli = lli->next) != h); } /* second pass ... set the reordered indexes */ mi = i + mr; mr = i = 0; for (s = cur_base->section_list ; s ; s = s->next) { h = lli = ((section_t *)(s->data))->itm_list; if (!lli) { continue; } do { if (lli->data_type & DT_RANGE) { i += mr; mr = 0; i = lli->idx + ((range_item_t *)(lli->data))->length + 1; } else if ((lli->data_type & DT_WEIGHTED) && !(s->data_type & DT_REORDER)) { i += mr; mr = 0; w = (weighted_item_t *)(lli->data); if (find_wi_index_reordered(w->symbol) /* reordered symbol skipped on first pass */ #if 0 || (s->data_type & DT_REORDER) /* or in a reordered section */ #endif ) { assert(!(s->data_type & DT_REORDER)); index2weight_buffer[index2weight_len] = lli->idx = ++i; index2weight_len += index2weight_len_inc; add_final_col_index(w->symbol); /* fprintf(stdout, "%11s: r %6d %6d %s\n", */ /* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */ continue; } i = lli->idx; /* fprintf(stdout, "%11s: w %6d %6d %s\n", */ /* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */ } else { /* verbose_msg(VDETAIL, "section: %s %d %d\n", ((section_t *)(s->data))->name, */ /* s->data_type, lli->data_type); */ /* assert(!(s->data_type & DT_REORDER)); */ /* assert(lli->data_type & DT_REORDER); */ #if 1 if (s->data_type & DT_REORDER) { h2 = l2 = lli; if (!h2) { continue; } } else { assert(s->data_type & DT_SECTION); h2 = l2 = ((section_t *)(lli->data))->itm_list; if (!h2) { continue; } } #else h2 = l2 = ((section_t *)(lli->data))->itm_list; if (!h2) { continue; } #endif r = 0; do { assert(l2->data_type & DT_WEIGHTED); ++r; l2->idx = i + r; /* fprintf(stdout, "%s: R %6d %s\n", */ /* ((section_t *)(lli->data))->name, l2->idx, ((weighted_item_t *)(l2->data))->symbol); */ } while ((l2 = l2->next) != h2); if (r > mr) { mr = r; } } } while ((lli = lli->next) != h); } /* finally, walk through all derived locales and set non-reordered section items */ mr = mi; for (cli = cur_base->derived_list ; cli ; cli = cli->next) { cl = (col_locale_t *)(cli->data); /* verbose_msg(VDETAIL, "pass3: %d %s\n", cli->data_type, cl->name); */ /* fprintf(stdout, "pass3: %d %s\n", cli->data_type, cl->name); */ assert(cli->data_type == DT_COL_LOCALE); i = mi; for (s = cl->section_list ; s ; s = s->next) { /* if (s->data_type & DT_REORDER) { */ /* continue; */ /* } */ h = lli = ((section_t *)(s->data))->itm_list; if (!lli) { continue; } do { assert(!(lli->data_type & DT_RANGE)); if (lli->data_type & DT_WEIGHTED) { /* verbose_msg(VDETAIL, " %d %d %s\n", lli->data_type, lli->idx, ((weighted_item_t *)(lli->data))->symbol); */ add_final_col_index(((weighted_item_t *)(lli->data))->symbol); if (s->data_type & DT_REORDER) { continue; } assert(lli->idx == INT_MIN); lli->idx = ++i; /* fprintf(stdout, "%11s: S %6d %6d %s\n", */ /* cl->name, lli->idx, */ /* final_index_val(((weighted_item_t *)(lli->data))->symbol), */ /* ((weighted_item_t *)(lli->data))->symbol); */ } else { assert(0); assert(lli->data_type & DT_SECTION); h2 = l2 = ((section_t *)(lli->data))->itm_list; if (!h2) { continue; } do { assert(l2->data_type & DT_WEIGHTED); assert(l2->idx == INT_MIN); l2->idx = ++i; add_final_col_index(((weighted_item_t *)(l2->data))->symbol); } while ((l2 = l2->next) != h2); } } while ((lli = lli->next) != h); } if (i > mr) { mr = i; } } max_weight = mr; assert(num_varying == tnumnodes(cur_base->root_wi_index_reordered)); /* we can now initialize the wcs2index array */ { ENTRY *p; ENTRY e; char buf[8]; static const char xd[] = "0123456789ABCDEF"; int starter_index = final_index; int wcs2index_count = 0; strcpy(buf, "<U....>"); memset(wcs2index, 0, sizeof(wcs2index)); e.key = (char *) buf; for (i=1 ; i <= 0xffff ; i++) { buf[5] = xd[ i & 0xf ]; buf[4] = xd[ (i >> 4) & 0xf ]; buf[3] = xd[ (i >> 8) & 0xf ]; buf[2] = xd[ (i >> 12) & 0xf ]; if ((p = hsearch(e, FIND)) != NULL) { ++wcs2index_count; if ((tfind(buf, &cur_base->root_starter_char, sym_cmp)) != NULL) { wcs2index[i] = ++starter_index; /* verbose_msg(VDETAIL, "wcs2index[ %#06x ] = %d (starter)\n", i, wcs2index[i]); */ } else { wcs2index[i] = (int)(p->data); /* verbose_msg(VDETAIL, "wcs2index[ %#06x ] = %d\n", i, wcs2index[i]); */ } } else { if ((tfind(buf, &cur_base->root_starter_char, sym_cmp)) != NULL) { error_msg("marked starter but not in hash: %s", buf); } } } /* ---------------------------------------------------------------------- */ { int i, n; table_data table; size_t t, smallest; n = 0; smallest = SIZE_MAX; table.ii = NULL; for (i=0 ; i < 14 ; i++) { if ((RANGE >> i) < 4) { break; } t = newopt(wcs2index, RANGE, i, &table); if (smallest >= t) { n = i; smallest = t; /* } else { */ /* break; */ } } /* printf("smallest = %u for range %#x (%u)\n", smallest, RANGE, RANGE); */ assert(smallest != SIZE_MAX); if (smallest + wcs2colidt_len >= WCS2COLIDT_LEN) { error_msg("WCS2COLIDT_LEN too small"); } base_locale_array[base_locale_len].wcs2colidt_offset = wcs2colidt_len; table.ii = wcs2colidt_buffer + wcs2colidt_len; t = smallest; smallest = SIZE_MAX; smallest = newopt(wcs2index, RANGE, n, &table); assert(t == smallest); wcs2colidt_len += smallest; /* verbose_msg(VDETAIL, "smallest = %d wcs2colidt_len = %d\n", smallest, wcs2colidt_len); */ #if 0 { unsigned int sc, n, i0, i1; unsigned int u = 0xe40; table_data *tbl = &table; #define __LOCALE_DATA_WCctype_TI_MASK ((1 << tbl->ti_shift)-1) #define __LOCALE_DATA_WCctype_TI_SHIFT (tbl->ti_shift) #define __LOCALE_DATA_WCctype_TI_LEN (tbl->ti_len) #define __LOCALE_DATA_WCctype_II_MASK ((1 << tbl->ii_shift)-1) #define __LOCALE_DATA_WCctype_II_SHIFT (tbl->ii_shift) #define __LOCALE_DATA_WCctype_II_LEN (tbl->ii_len) sc = u & __LOCALE_DATA_WCctype_TI_MASK; u >>= __LOCALE_DATA_WCctype_TI_SHIFT; n = u & __LOCALE_DATA_WCctype_II_MASK; u >>= __LOCALE_DATA_WCctype_II_SHIFT; i0 = tbl->ii[u]; verbose_msg(VDETAIL, "i0 = %d\n", i0); i0 <<= __LOCALE_DATA_WCctype_II_SHIFT; i1 = tbl->ii[__LOCALE_DATA_WCctype_II_LEN + i0 + n]; /* i1 = tbl->ti[i0 + n]; */ verbose_msg(VDETAIL, "i1 = %d\n", i1); i1 <<= __LOCALE_DATA_WCctype_TI_SHIFT; /* return *(uint16_t *)(&(tbl->ii[__LOCALE_DATA_WCctype_II_LEN + __LOCALE_DATA_WCctype_TI_LEN + i1 + sc])); */ verbose_msg(VDETAIL, "i2 = %d\n", __LOCALE_DATA_WCctype_II_LEN + __LOCALE_DATA_WCctype_TI_LEN + i1 + sc); verbose_msg(VDETAIL, "val = %d\n", tbl->ii[__LOCALE_DATA_WCctype_II_LEN + __LOCALE_DATA_WCctype_TI_LEN + i1 + sc]); /* return tbl->ut[i1 + sc]; */ } #endif base_locale_array[base_locale_len].ii_shift = table.ii_shift; base_locale_array[base_locale_len].ti_shift = table.ti_shift; base_locale_array[base_locale_len].ii_len = table.ii_len; base_locale_array[base_locale_len].ti_len = table.ti_len; } /* ---------------------------------------------------------------------- */ base_locale_array[base_locale_len].num_col_base = num_invariant + num_varying; base_locale_array[base_locale_len].max_col_index = final_index; base_locale_array[base_locale_len].max_weight = max_weight; verbose_msg(VDETAIL, "%s: %6u invariant %6u varying %6u derived %6u total %6u max weight %6u wcs2\n", cur_base->name, num_invariant, num_varying, tnumnodes(cur_base->root_derived_wi), final_index, max_weight, wcs2index_count); } #if 1 /* ok, now we need to dump out the base and derived tables... */ /* don't forget to break up collating elements!!! */ /* fprintf(stdout, "**************************************************\n"); */ /* first pass ... set the invariants */ for (s = cur_base->section_list ; s ; s = s->next) { #if 1 if (s->data_type & DT_REORDER) { verbose_msg(VDETAIL, "1: skipping reordered section %s\n", ((section_t *)(s->data))->name); continue; } #endif h = lli = ((section_t *)(s->data))->itm_list; if (!lli) { continue; } do { if (lli->data_type & DT_WEIGHTED) { w = (weighted_item_t *)(lli->data); if (find_wi_index_reordered(w->symbol)) { /* reordered symbol so skip on first pass */ continue; } if (index2weight_len_inc) { index2ruleidx_buffer[index2ruleidx_len++] = add_rule((weighted_item_t *)(lli->data)); } /* fprintf(stdout, "%11s: w %6d %6d %s\n", */ /* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */ } } while ((lli = lli->next) != h); } /* second pass ... set varying */ for (s = cur_base->section_list ; s ; s = s->next) { #if 1 if (s->data_type & DT_REORDER) { verbose_msg(VDETAIL, "2: skipping reordered section %s\n", ((section_t *)(s->data))->name); continue; } #endif h = lli = ((section_t *)(s->data))->itm_list; if (!lli) { continue; } do { if (lli->data_type & DT_WEIGHTED) { w = (weighted_item_t *)(lli->data); if (find_wi_index_reordered(w->symbol)) { /* reordered symbol so skip on first pass */ if (index2weight_len_inc) { index2ruleidx_buffer[index2ruleidx_len++] = add_rule((weighted_item_t *)(lli->data)); } /* fprintf(stdout, "%11s: r %6d %6d %s\n", */ /* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */ continue; } } } while ((lli = lli->next) != h); } do_starter_lists(cur_base); /* verbose_msg(VDETAIL,"updated final_index = %d\n", final_index); */ if (rli) { base_locale_array[base_locale_len].range_low = strtoul(((range_item_t *)(rli->data))->symbol1 + 2, NULL, 16); base_locale_array[base_locale_len].range_count = ((range_item_t *)(rli->data))->length; base_locale_array[base_locale_len].range_base_weight = rli->idx; base_locale_array[base_locale_len].range_rule_offset = add_range_rule((range_item_t *)(rli->data)); /* fprintf(stdout, "%11s: %6d %6d %s %s (%d)\n", */ /* "RANGE", rli->idx, -1, */ /* ((range_item_t *)(rli->data))->symbol1, */ /* ((range_item_t *)(rli->data))->symbol2, */ /* ((range_item_t *)(rli->data))->length); */ } /* fprintf(stdout,"\nDerived\n\n"); */ /* first, if base name is of the form ll_CC, add a derived locale for it */ if ((strlen(cur_base->name) == 5) && islower(cur_base->name[0]) && islower(cur_base->name[1]) && (cur_base->name[2] == '_') && isupper(cur_base->name[3]) && isupper(cur_base->name[4]) ) { verbose_msg(VDETAIL, "adding special derived for %s\n", cur_base->name); /* verbose_msg(VDETAIL,"updated final_index = %d\n", final_index); */ assert(der_locale_len+1 < DER_LOCALE_LEN); der_locale_array[der_locale_len].name = cur_base->name; der_locale_array[der_locale_len].base_idx = base_locale_len; u16_buf[0] = 1; u16_buf[1] = 0; u16_buf_len = 2; mm = NULL; if ((u16_buf_len > override_len) || !(mm = memmem(override_buffer, override_len*sizeof(override_buffer[0]), u16_buf, u16_buf_len*sizeof(u16_buf[0]))) ) { assert(override_len + u16_buf_len < OVERRIDE_LEN); memcpy(override_buffer + override_len, u16_buf, u16_buf_len*sizeof(u16_buf[0])); der_locale_array[der_locale_len].overrides_offset = override_len; override_len += u16_buf_len; /* printf("%s: override_len = %d u16_buf_len = %d\n", cl->name, override_len, u16_buf_len); */ } else if (!(u16_buf_len > override_len)) { assert(mm); der_locale_array[der_locale_len].overrides_offset = ((uint16_t *)(mm)) - override_buffer; /* printf("%s: memmem found a match with u16_buf_len = %d\n", cl->name, u16_buf_len); */ } der_locale_array[der_locale_len].multistart_offset = base_locale_array[base_locale_len].multistart_offset; der_locale_array[der_locale_len].undefined_idx = final_index_val0("UNDEFINED"); if (!der_locale_array[der_locale_len].undefined_idx) { error_msg("no UNDEFINED definition for %s", cur_base->name); } ++der_locale_len; } else { verbose_msg(VDETAIL, "NOT adding special derived for %s\n", cur_base->name); } /* now all the derived... */ for (cli = cur_base->derived_list ; cli ; cli = cli->next) { cl = (col_locale_t *)(cli->data); assert(cli->data_type == DT_COL_LOCALE); assert(der_locale_len+1 < DER_LOCALE_LEN); der_locale_array[der_locale_len].name = cl->name; der_locale_array[der_locale_len].base_idx = base_locale_len; u16_buf_len = 0; for (i = 0 ; i < 2 ; i++) { if (i) { /* fprintf(stdout, " section --- (singles)\n"); */ u16_buf[u16_buf_len++] = 1; /* single */ } /* we do this in two passes... first all sequences, then all single reorders */ for (s = cl->section_list ; s ; s = s->next) { /* verbose_msg(VDETAIL, "doing section %s\n", ((section_t *)(s->data))->name); */ h = lli = ((section_t *)(s->data))->itm_list; if (!lli) { /* fprintf(stdout, "EMPTY ITEM LIST IN SECTION %s\n", ((section_t *)(s->data))->name ); */ continue; } assert(u16_buf_len +4 < sizeof(u16_buf)/sizeof(u16_buf[0])); if ((!i && (ll_len(h) > 1) ) || (ll_len(h) == i)) { if (!i) { /* fprintf(stdout, " section ----------------- %d %d\n", i, ll_len(h)); */ u16_buf[u16_buf_len++] = ll_len(h); /* multi */ assert(lli->data_type & DT_WEIGHTED); #if 0 u16_buf[u16_buf_len++] = final_index_val(((weighted_item_t *)(lli->data))->symbol); /* start index */ #endif u16_buf[u16_buf_len++] = lli->idx; /* start weight */ } do { assert(lli->data_type & DT_WEIGHTED); if (lli->data_type & DT_WEIGHTED) { /* fprintf(stdout, "%11s: S %6d %6d %s\n", */ /* cl->name, lli->idx, */ /* final_index_val(((weighted_item_t *)(lli->data))->symbol), */ /* ((weighted_item_t *)(lli->data))->symbol); */ #if 0 if (i) { assert(u16_buf_len +4 < sizeof(u16_buf)/sizeof(u16_buf[0])); u16_buf[u16_buf_len++] = final_index_val(((weighted_item_t *)(lli->data))->symbol); assert(u16_buf[u16_buf_len-1]); u16_buf[u16_buf_len++] = lli->idx; /* weight */ } #else assert(u16_buf_len +4 < sizeof(u16_buf)/sizeof(u16_buf[0])); u16_buf[u16_buf_len++] = final_index_val(((weighted_item_t *)(lli->data))->symbol); assert(u16_buf[u16_buf_len-1]); if (i) { u16_buf[u16_buf_len++] = lli->idx; /* weight */ } #endif u16_buf[u16_buf_len++] = add_rule((weighted_item_t *)(lli->data)); } } while ((lli = lli->next) != h); } } } u16_buf[u16_buf_len++] = 0; mm = NULL; if ((u16_buf_len > override_len) || !(mm = memmem(override_buffer, override_len*sizeof(override_buffer[0]), u16_buf, u16_buf_len*sizeof(u16_buf[0]))) ) { assert(override_len + u16_buf_len < OVERRIDE_LEN); memcpy(override_buffer + override_len, u16_buf, u16_buf_len*sizeof(u16_buf[0])); der_locale_array[der_locale_len].overrides_offset = override_len; override_len += u16_buf_len; /* printf("%s: override_len = %d u16_buf_len = %d\n", cl->name, override_len, u16_buf_len); */ } else if (!(u16_buf_len > override_len)) { assert(mm); der_locale_array[der_locale_len].overrides_offset = ((uint16_t *)(mm)) - override_buffer; /* printf("%s: memmem found a match with u16_buf_len = %d\n", cl->name, u16_buf_len); */ } do_starter_lists(cl); der_locale_array[der_locale_len].undefined_idx = final_index_val0("UNDEFINED"); #if 0 assert(der_locale_array[der_locale_len].undefined_idx); if (!der_locale_array[der_locale_len].undefined_idx) { der_locale_array[der_locale_len].undefined_idx = base_locale_array[base_locale_len].undefined_idx; } #endif if (!der_locale_array[der_locale_len].undefined_idx) { error_msg("no UNDEFINED definition for %s", cl->name); } ++der_locale_len; } #endif /* base_locale_array[base_locale_len].undefined_idx = final_index_val0("UNDEFINED"); */ base_locale_array[base_locale_len].undefined_idx = 0; hdestroy(); ++base_locale_len; /* if (tnumnodes(cur_base->root_starter_char)) { */ /* verbose_msg(VDETAIL, "starter nodes\n"); */ /* twalk(cur_base->root_starter_char, print_starter_node); */ /* } */ } static int starter_all_cmp(const void *n1, const void *n2) { const char *s1 = ((weighted_item_t *) n1)->symbol; const char *s2 = ((weighted_item_t *) n2)->symbol; colitem_t x; colitem_t *p; int n; /* sort by 1st char ... then inverse for string */ x.element = NULL; if (!is_ucode(s1)) { x.string = s1; p = tfind(&x, &cur_base->root_colitem, colitem_cmp); s1 = (*((colitem_t **) p))->element + 1; } if (!is_ucode(s2)) { x.string = s2; p = tfind(&x, &cur_base->root_colitem, colitem_cmp); s2 = (*((colitem_t **) p))->element + 1; } /* <U####>< */ /* 01234567 */ assert(is_ucode(s1)); assert(is_ucode(s2)); n = strncmp(s1+2, s2+2, 4); if (n) { return n; } s1 += 7; s2 += 7; return strcmp(s2, s1); } static void print_starter_all_node(const void *ptr, VISIT order, int level) { const weighted_item_t *w = *(const weighted_item_t **) ptr; colitem_t *ci; void *p; int n; colitem_t x; if (order == postorder || order == leaf) { #if 0 if ((n = is_ucode(w->symbol)) != 0) { printf(" %s\n", w->symbol); } else { x.string = w->symbol; x.element = NULL; p = tfind(&x, &cur_base->root_colitem, colitem_cmp); assert(p); ci = *((colitem_t **) p); printf("%s = %s\n", ci->element, w->symbol); } #else printf("%s|", w->symbol); /* if ((n = is_ucode(w->symbol)) != 0) { */ /* printf("\n"); */ /* } */ #endif } } static void process_starter_node(const void *ptr, VISIT order, int level) { const weighted_item_t *w = *(const weighted_item_t **) ptr; colitem_t *ci; void *p; int n; colitem_t x; const char *s; char buf[32]; /* store index of collation item followed by (unprefixed) nul-terminated string */ if (order == postorder || order == leaf) { if ((n = is_ucode(w->symbol)) != 0) { u16_buf[u16_buf_len++] = final_index_val(w->symbol); assert(u16_buf[u16_buf_len-1]); u16_buf[u16_buf_len++] = 0; if (++u16_starter < base_locale_array[base_locale_len].num_starters) { u16_buf[u16_starter] = u16_buf_len; } /* verbose_msg(VDETAIL, "ucode - %d %d\n", u16_buf[u16_starter-1], u16_buf_len); */ } else { x.string = w->symbol; x.element = NULL; p = tfind(&x, &cur_base->root_colitem, colitem_cmp); assert(p); ci = *((colitem_t **) p); s = ci->element; u16_buf[u16_buf_len++] = final_index_val(w->symbol); assert(u16_buf[u16_buf_len-1]); assert(*s == '"'); n = is_ucode(++s); /* verbose_msg(VDETAIL, "s is |%s| with len %d (%d)\n", s, strlen(s), n); */ assert(n); s += n; while (*s != '"') { n = is_ucode(s); assert(n); strncpy(buf, s, n+1); buf[n] = 0; /* verbose_msg(VDETAIL, "buf is |%s| with len %d (%d)\n", buf, strlen(buf), n); */ u16_buf[u16_buf_len++] = final_index_val(buf); assert(u16_buf[u16_buf_len-1]); s += n; } u16_buf[u16_buf_len++] = 0; } } } static void **p_cl_root_starter_all; static void complete_starter_node(const void *ptr, VISIT order, int level) { weighted_item_t w; weighted_item_t *p; if (order == postorder || order == leaf) { w.symbol = *(const char **) ptr; w.weight = NULL; if (!tfind(&w, p_cl_root_starter_all, starter_all_cmp)) { p = xmalloc(sizeof(weighted_item_t)); p->symbol = w.symbol; p->weight = NULL; /* verbose_msg(VDETAIL, "complete_starter_node: %s\n", *(const char **) ptr); */ if (!tsearch(p, p_cl_root_starter_all, starter_all_cmp)) { error_msg("OUT OF MEMORY"); } } } } static void do_starter_lists(col_locale_t *cl) { ll_item_t *s; ll_item_t *h; ll_item_t *lli; col_locale_t *c; colitem_t *ci; weighted_item_t *w; void *p; char buf[32]; int n; colitem_t x; void *mm; c = cl; if (c != cur_base) { c = cur_base; } /* printf("STARTERS %s --------------------\n", cl->name); */ LOOP: for (s = c->section_list ; s ; s = s->next) { h = lli = ((section_t *)(s->data))->itm_list; if (!lli) { continue; } do { if (lli->data_type & DT_WEIGHTED) { w = (weighted_item_t *)(lli->data); ci = NULL; if ((n = is_ucode(w->symbol)) != 0) { strcpy(buf, w->symbol); } else { /* fprintf(stdout, "looking for |%s|\n", w->symbol); */ x.string = w->symbol; x.element = NULL; p = tfind(&x, &cur_base->root_colitem, colitem_cmp); if (!p) { /* verbose_msg(VDETAIL, "Whoa... processing starters for %s and couldn't find %s\n", */ /* cl->name, w->symbol); */ continue; } ci = *((colitem_t **) p); if (!ci->element) { /* just a collating symbol */ continue; } assert(ci->element[0] == '"'); n = is_ucode(ci->element + 1); assert(n); strncpy(buf, ci->element + 1, n); } if ((tfind(buf, &cur_base->root_starter_char, sym_cmp)) != NULL) { /* fprintf(stdout, "adding from %s: %s", c->name, w->symbol); */ /* if (ci) { */ /* fprintf(stdout, " = %s", ci->element); */ /* } */ /* fprintf(stdout, "\n"); */ if (!tsearch(w, &cl->root_starter_all, starter_all_cmp)) { error_msg("OUT OF MEMORY"); } } } } while ((lli = lli->next) != h); } if (c != cl) { c = cl; goto LOOP; } p_cl_root_starter_all = &cl->root_starter_all; twalk(cur_base->root_starter_char, complete_starter_node); if (cl == cur_base) { base_locale_array[base_locale_len].num_starters = tnumnodes(cur_base->root_starter_char); } #if 0 printf("\nNow walking tree...\n\n"); twalk(cl->root_starter_all, print_starter_all_node); printf("\n\n"); #endif u16_starter = 0; u16_buf[0] = u16_buf_len = base_locale_array[base_locale_len].num_starters; twalk(cl->root_starter_all, process_starter_node); /* verbose_msg(VDETAIL, "s=%d n=%d\n", u16_starter, base_locale_array[base_locale_len].num_starters); */ assert(u16_starter == base_locale_array[base_locale_len].num_starters); #if 0 { int i; for (i=0 ; i < u16_buf_len ; i++) { verbose_msg(VDETAIL, "starter %2d: %d - %#06x\n", i, u16_buf[i], u16_buf[i]); }} #endif mm = NULL; if (u16_buf_len) { /* assert(base_locale_array[base_locale_len].num_starters); */ if ((u16_buf_len > multistart_len) || !(mm = memmem(multistart_buffer, multistart_len*sizeof(multistart_buffer[0]), u16_buf, u16_buf_len*sizeof(u16_buf[0]))) ) { assert(multistart_len + u16_buf_len < MULTISTART_LEN); memcpy(multistart_buffer + multistart_len, u16_buf, u16_buf_len*sizeof(u16_buf[0])); if (cl == cur_base) { base_locale_array[base_locale_len].multistart_offset = multistart_len; } else { der_locale_array[der_locale_len].multistart_offset = multistart_len; } multistart_len += u16_buf_len; /* verbose_msg(VDETAIL, "%s: multistart_len = %d u16_buf_len = %d\n", cl->name, multistart_len, u16_buf_len); */ } else if (!(u16_buf_len > multistart_len)) { assert(mm); if (cl == cur_base) { base_locale_array[base_locale_len].multistart_offset = ((uint16_t *)(mm)) - multistart_buffer; } else { der_locale_array[der_locale_len].multistart_offset = ((uint16_t *)(mm)) - multistart_buffer; } /* verbose_msg(VDETAIL, "%s: memmem found a match with u16_buf_len = %d\n", cl->name, u16_buf_len); */ } } else { assert(!base_locale_array[base_locale_len].num_starters); } /* printf("u16_buf_len = %d\n", u16_buf_len); */ /* printf("STARTERS %s DONE ---------------\n", cl->name); */ } /* For sorting the blocks of unsigned chars. */ static size_t nu_val; int nu_memcmp(const void *a, const void *b) { return memcmp(*(unsigned char**)a, *(unsigned char**)b, nu_val * sizeof(tbl_item)); } size_t newopt(tbl_item *ut, size_t usize, int shift, table_data *tbl) { static int recurse; tbl_item *ti[RANGE]; /* table index */ size_t numblocks; size_t blocksize; size_t uniq; size_t i, j; size_t smallest, t; tbl_item *ii_save; int uniqblock[1 << (8*sizeof(tbl_item) - 1)]; tbl_item uit[RANGE]; int shift2; if (shift > 15) { return SIZE_MAX; } ii_save = NULL; blocksize = 1 << shift; numblocks = usize >> shift; /* init table index */ for (i=j=0 ; i < numblocks ; i++) { ti[i] = ut + j; j += blocksize; } /* sort */ nu_val = blocksize; qsort(ti, numblocks, sizeof(unsigned char *), nu_memcmp); uniq = 1; uit[(ti[0]-ut)/blocksize] = 0; for (i=1 ; i < numblocks ; i++) { if (memcmp(ti[i-1], ti[i], blocksize*sizeof(tbl_item)) < 0) { if (++uniq > (1 << (8*sizeof(tbl_item) - 1))) { break; } uniqblock[uniq - 1] = i; } #if 1 else if (memcmp(ti[i-1], ti[i], blocksize*sizeof(tbl_item)) > 0) { printf("bad sort %i!\n", i); abort(); } #endif uit[(ti[i]-ut)/blocksize] = uniq - 1; } smallest = SIZE_MAX; shift2 = -1; if (uniq <= (1 << (8*sizeof(tbl_item) - 1))) { smallest = numblocks + uniq * blocksize; if (!recurse) { ++recurse; for (j=1 ; j < 14 ; j++) { if ((numblocks >> j) < 2) break; if (tbl) { ii_save = tbl->ii; tbl->ii = NULL; } if ((t = newopt(uit, numblocks, j, tbl)) < SIZE_MAX) { t += uniq * blocksize; } if (tbl) { tbl->ii = ii_save; } if (smallest >= t) { shift2 = j; smallest = t; /* if (!tbl->ii) { */ /* printf("ishift %u tshift %u size %u\n", */ /* shift2, shift, t); */ /* } */ /* } else { */ /* break; */ } } --recurse; } } else { return SIZE_MAX; } if (tbl->ii) { if (recurse) { tbl->ii_shift = shift; tbl->ii_len = numblocks; memcpy(tbl->ii, uit, numblocks*sizeof(tbl_item)); tbl->ti = tbl->ii + tbl->ii_len; tbl->ti_len = uniq * blocksize; for (i=0 ; i < uniq ; i++) { memcpy(tbl->ti + i * blocksize, ti[uniqblock[i]], blocksize*sizeof(tbl_item)); } } else { ++recurse; /* printf("setting ishift %u tshift %u\n", shift2, shift); */ newopt(uit, numblocks, shift2, tbl); --recurse; tbl->ti_shift = shift; tbl->ut_len = uniq * blocksize; tbl->ut = tbl->ti + tbl->ti_len; for (i=0 ; i < uniq ; i++) { memcpy(tbl->ut + i * blocksize, ti[uniqblock[i]], blocksize*sizeof(tbl_item)); } } } return smallest; } static const int rule2val[8] = { -1, (1 << 14), /* forward */ (2 << 14), /* position */ (3 << 14), /* forward,position */ 0, /* backward */ -1, -1, -1, }; static int final_index_val_x(const char *s, const char *sym) { int r; if (!(r = final_index_val0(s))) { if (!strcmp(s, "IGNORE")) { r = 0; } else if (!strcmp(s, "..") || !strcmp(sym, "RANGE")) { if (*sym == '.') { final_index_val(sym); /* make sure it's known */ } r = 0x3fff; } else if (!strcmp(s, ".")) { r = 0x3ffe; } else { error_msg("can't find final index: %s", s); } } return r; } /* store rule2val in 2 high bits and collation index in lower. * for sort strings, store (offset from base) + max colindex as index. */ static unsigned int add_rule(weighted_item_t *wi) { weight_t *w = wi->weight; int i, j, r, n; uint16_t rbuf[MAX_COLLATION_WEIGHTS]; uint16_t ws_buf[32]; void *mm; char buf[32]; const char *s; const char *e; for (i=0 ; i < MAX_COLLATION_WEIGHTS ; i++) { rbuf[i] = rule2val[R_FORWARD]; /* set a default to forward-ignore */ } if (base_locale_array[base_locale_len].num_weights < w->num_weights) { base_locale_array[base_locale_len].num_weights = w->num_weights; } for (i=0 ; i < w->num_weights ; i++) { assert(rule2val[(int)(w->rule[i])] >= 0); assert(w->colitem[i] && *w->colitem[i]); if (*w->colitem[i] == '"') { /* string... */ s = w->colitem[i] + 1; assert(*s == '<'); n = 0; do { e = s; do { if (*e == '/') { e += 2; continue; } } while (*e++ != '>'); assert(((size_t)(e-s) < sizeof(buf))); memcpy(buf, s, (size_t)(e-s)); buf[(size_t)(e-s)] = 0; r = final_index_val_x(buf, wi->symbol); assert(n + 1 < sizeof(ws_buf)/sizeof(ws_buf[0])); ws_buf[n++] = r | rule2val[(int)(w->rule[i])]; s = e; } while (*s != '"'); ws_buf[n++] = 0; /* terminator */ mm = memmem(weightstr_buffer, weightstr_len*sizeof(weightstr_buffer[0]), ws_buf, n*sizeof(ws_buf[0])); if (!mm) { assert(weightstr_len + n < WEIGHTSTR_LEN); memcpy(weightstr_buffer + weightstr_len, ws_buf, n*sizeof(ws_buf[0])); mm = weightstr_buffer + weightstr_len; weightstr_len += n; } r = (((uint16_t *)(mm)) - weightstr_buffer) + base_locale_array[base_locale_len].max_col_index + 2; assert(r < (1 << 14)); rbuf[i] = r | rule2val[(int)(w->rule[i])]; } else { /* item */ r = final_index_val_x(w->colitem[i], wi->symbol); rbuf[i] = r | rule2val[(int)(w->rule[i])]; } } for (i=0 ; i < ruletable_len ; i += MAX_COLLATION_WEIGHTS) { if (!memcmp(ruletable_buffer + i, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0]))) { return i/MAX_COLLATION_WEIGHTS; } } memcpy(ruletable_buffer + ruletable_len, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0])); ruletable_len += MAX_COLLATION_WEIGHTS; return (ruletable_len / MAX_COLLATION_WEIGHTS)-1; } static unsigned int add_range_rule(range_item_t *ri) { weight_t *w = ri->weight; int i, j, r, n; uint16_t rbuf[MAX_COLLATION_WEIGHTS]; uint16_t ws_buf[32]; void *mm; char buf[32]; const char *s; const char *e; for (i=0 ; i < MAX_COLLATION_WEIGHTS ; i++) { rbuf[i] = rule2val[R_FORWARD]; /* set a default to forward-ignore */ } if (base_locale_array[base_locale_len].num_weights < w->num_weights) { base_locale_array[base_locale_len].num_weights = w->num_weights; } for (i=0 ; i < w->num_weights ; i++) { assert(rule2val[(int)(w->rule[i])] >= 0); assert(w->colitem[i] && *w->colitem[i]); if (*w->colitem[i] == '"') { /* string... */ s = w->colitem[i] + 1; assert(*s == '<'); n = 0; do { e = s; do { if (*e == '/') { e += 2; continue; } } while (*e++ != '>'); assert(((size_t)(e-s) < sizeof(buf))); memcpy(buf, s, (size_t)(e-s)); buf[(size_t)(e-s)] = 0; r = final_index_val_x(buf, "RANGE"); assert(n + 1 < sizeof(ws_buf)/sizeof(ws_buf[0])); ws_buf[n++] = r | rule2val[(int)(w->rule[i])]; s = e; } while (*s != '"'); ws_buf[n++] = 0; /* terminator */ mm = memmem(weightstr_buffer, weightstr_len*sizeof(weightstr_buffer[0]), ws_buf, n*sizeof(ws_buf[0])); if (!mm) { assert(weightstr_len + n < WEIGHTSTR_LEN); memcpy(weightstr_buffer + weightstr_len, ws_buf, n*sizeof(ws_buf[0])); mm = weightstr_buffer + weightstr_len; weightstr_len += n; } r = (((uint16_t *)(mm)) - weightstr_buffer) + base_locale_array[base_locale_len].max_col_index + 2; assert(r < (1 << 14)); rbuf[i] = r | rule2val[(int)(w->rule[i])]; } else { /* item */ r = final_index_val_x(w->colitem[i], "RANGE"); rbuf[i] = r | rule2val[(int)(w->rule[i])]; } } for (i=0 ; i < ruletable_len ; i += MAX_COLLATION_WEIGHTS) { if (!memcmp(ruletable_buffer + i, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0]))) { return i/MAX_COLLATION_WEIGHTS; } } memcpy(ruletable_buffer + ruletable_len, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0])); ruletable_len += MAX_COLLATION_WEIGHTS; return (ruletable_len / MAX_COLLATION_WEIGHTS)-1; } #define DUMPn(X) fprintf(stderr, "%10d-%-.20s", base_locale_array[n]. X, #X); static void dump_base_locale(int n) { assert(n < base_locale_len); fprintf(stderr, "Base Locale: %s\n", base_locale_array[n].name); DUMPn(num_weights); DUMPn(ii_shift); DUMPn(ti_shift); DUMPn(ii_len); DUMPn(ti_len); DUMPn(max_weight); fprintf(stderr, "\n"); DUMPn(num_col_base); DUMPn(max_col_index); DUMPn(undefined_idx); DUMPn(range_low); DUMPn(range_count); fprintf(stderr, "\n"); DUMPn(range_base_weight); DUMPn(num_starters); fprintf(stderr, "\n"); DUMPn(range_rule_offset); DUMPn(wcs2colidt_offset); DUMPn(index2weight_offset); fprintf(stderr, "\n"); DUMPn(index2ruleidx_offset); DUMPn(multistart_offset); fprintf(stderr, "\n"); } #undef DUMPn #define DUMPn(X) fprintf(stderr, "%10d-%s", der_locale_array[n]. X, #X); static void dump_der_locale(int n) { assert(n < der_locale_len); fprintf(stderr, "Derived Locale: %s (%.12s)", der_locale_array[n].name, base_locale_array[der_locale_array[n].base_idx].name); DUMPn(base_idx); DUMPn(undefined_idx); DUMPn(overrides_offset); DUMPn(multistart_offset); fprintf(stderr, "\n"); } static unsigned long collate_pos; static void dump_u16_array(FILE *fp, uint16_t *u, int len, const char *name) { int i; fprintf(fp, "\t/* %8lu %s */\n", collate_pos, name); for (i=0 ; i < len ; i++) { if (!(i & 7)) { fprintf(fp, "\n\t"); } fprintf(fp," %#06x,", (unsigned int)(u[i])); } fprintf(fp,"\n"); collate_pos += len; } #define OUT_U16C(X,N) fprintf(fp,"\t%10d, /* %8lu %s */\n", X, collate_pos++, N); static void dump_collate(FILE *fp) { int n; fprintf(fp, "const uint16_t __locale_collate_tbl[] = {\n"); OUT_U16C(base_locale_len, "numbef of base locales"); OUT_U16C(der_locale_len, "number of derived locales"); OUT_U16C(MAX_COLLATION_WEIGHTS, "max collation weights"); OUT_U16C(index2weight_len, "number of index2{weight|ruleidx} elements"); OUT_U16C(weightstr_len, "number of weightstr elements"); OUT_U16C(multistart_len, "number of multistart elements"); OUT_U16C(override_len, "number of override elements"); OUT_U16C(ruletable_len, "number of ruletable elements"); #undef DUMPn #define DUMPn(X) fprintf(fp, "\t%10d, /* %8lu %s */\n", base_locale_array[n]. X, collate_pos++, #X); for (n=0 ; n < base_locale_len ; n++) { unsigned wcs2colidt_offset_low = base_locale_array[n].wcs2colidt_offset & 0xffffU; unsigned wcs2colidt_offset_hi = base_locale_array[n].wcs2colidt_offset >> 16; fprintf(fp, "\t/* Base Locale %2d: %s */\n", n, base_locale_array[n].name); DUMPn(num_weights); DUMPn(num_starters); DUMPn(ii_shift); DUMPn(ti_shift); DUMPn(ii_len); DUMPn(ti_len); DUMPn(max_weight); DUMPn(num_col_base); DUMPn(max_col_index); DUMPn(undefined_idx); DUMPn(range_low); DUMPn(range_count); DUMPn(range_base_weight); DUMPn(range_rule_offset); DUMPn(index2weight_offset); DUMPn(index2ruleidx_offset); DUMPn(multistart_offset); #undef DUMPn #define DUMPn(X) fprintf(fp, "\t%10d, /* %8lu %s */\n", X, collate_pos++, #X); DUMPn(wcs2colidt_offset_low); DUMPn(wcs2colidt_offset_hi); } #undef DUMPn fprintf(fp, "#define COL_IDX_C %5d\n", 0); #define DUMPn(X) fprintf(fp, "\t%10d, /* %8lu %s */\n", der_locale_array[n]. X, collate_pos++, #X); for (n=0 ; n < der_locale_len ; n++) { fprintf(fp, "#define COL_IDX_%s %5d\n", der_locale_array[n].name, n+1); fprintf(fp, "\t/* Derived Locale %4d: %s (%.12s) */\n", n, der_locale_array[n].name, base_locale_array[der_locale_array[n].base_idx].name); DUMPn(base_idx); DUMPn(undefined_idx); DUMPn(overrides_offset); DUMPn(multistart_offset); } #undef DUMPn fprintf(fp, "\n"); dump_u16_array(fp, index2weight_buffer, index2weight_len, "index2weight"); dump_u16_array(fp, index2ruleidx_buffer, index2ruleidx_len, "index2ruleidx"); dump_u16_array(fp, multistart_buffer, multistart_len, "multistart"); dump_u16_array(fp, override_buffer, override_len, "override"); dump_u16_array(fp, ruletable_buffer, ruletable_len, "ruletable"); dump_u16_array(fp, weightstr_buffer, weightstr_len, "weightstr"); dump_u16_array(fp, wcs2colidt_buffer, wcs2colidt_len, "wcs2colidt"); fprintf(fp,"}; /* %8lu */\n", collate_pos); fprintf(fp,"#define __lc_collate_data_LEN %lu\n\n", collate_pos); }