pmt: initial 3.0.2 update
This commit is contained in:
515
include/libgnulib/unicase/u-ct-totitle.h
Executable file
515
include/libgnulib/unicase/u-ct-totitle.h
Executable file
@@ -0,0 +1,515 @@
|
||||
/* Titlecase mapping for UTF-8/UTF-16/UTF-32 substrings (locale dependent).
|
||||
Copyright (C) 2009-2024 Free Software Foundation, Inc.
|
||||
Written by Bruno Haible <bruno@clisp.org>, 2009.
|
||||
|
||||
This file is free software.
|
||||
It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
|
||||
You can redistribute it and/or modify it under either
|
||||
- the terms of the GNU Lesser General Public License as published
|
||||
by the Free Software Foundation, either version 3, or (at your
|
||||
option) any later version, or
|
||||
- the terms of the GNU General Public License as published by the
|
||||
Free Software Foundation; either version 2, or (at your option)
|
||||
any later version, or
|
||||
- the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
|
||||
|
||||
This file is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License and the GNU General Public License
|
||||
for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License and of the GNU General Public License along with this
|
||||
program. If not, see <https://www.gnu.org/licenses/>. */
|
||||
|
||||
/* Quoting the Unicode standard, section "Default Case Algorithms":
|
||||
Find the word boundaries in X according to Unicode Standard Annex #29,
|
||||
“Text Boundaries.” For each word boundary, find the first cased character
|
||||
F following the word boundary. If F exists, map F to Titlecase_Mapping(F);
|
||||
then map all characters C between F and the following word boundary to
|
||||
Lowercase_Mapping(C). */
|
||||
|
||||
UNIT *
|
||||
FUNC (const UNIT *s, size_t n,
|
||||
casing_prefix_context_t prefix_context,
|
||||
casing_suffix_context_t suffix_context,
|
||||
const char *iso639_language,
|
||||
uninorm_t nf,
|
||||
UNIT *resultbuf, size_t *lengthp)
|
||||
{
|
||||
/* The result being accumulated. */
|
||||
UNIT *result;
|
||||
size_t length;
|
||||
size_t allocated;
|
||||
/* An array containing the word break positions. */
|
||||
char *wordbreaks;
|
||||
|
||||
/* Initialize the accumulator. */
|
||||
if (nf != NULL || resultbuf == NULL)
|
||||
{
|
||||
result = NULL;
|
||||
allocated = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
result = resultbuf;
|
||||
allocated = *lengthp;
|
||||
}
|
||||
length = 0;
|
||||
|
||||
/* Initialize the word breaks array. */
|
||||
if (n > 0)
|
||||
{
|
||||
wordbreaks = (char *) malloc (n);
|
||||
if (wordbreaks == NULL)
|
||||
{
|
||||
errno = ENOMEM;
|
||||
goto fail2;
|
||||
}
|
||||
U_WORDBREAKS (s, n, wordbreaks);
|
||||
}
|
||||
else
|
||||
wordbreaks = NULL;
|
||||
|
||||
{
|
||||
const UNIT *s_end = s + n;
|
||||
const char *wp = wordbreaks;
|
||||
|
||||
/* When considering the string as segmented by word boundaries: For each
|
||||
such segment:
|
||||
- In the first part, we are searching for the first cased character.
|
||||
In this state, in_word_first_part = true, and no conversion takes
|
||||
place.
|
||||
- In the second part, we are converting every character: the first
|
||||
among these characters to title case, the other ones to lower case.
|
||||
In this state, in_word_first_part = false. */
|
||||
bool in_word_first_part = true;
|
||||
|
||||
/* Helper for evaluating the FINAL_SIGMA condition:
|
||||
Last character that was not case-ignorable. */
|
||||
ucs4_t last_char_except_ignorable =
|
||||
prefix_context.last_char_except_ignorable;
|
||||
|
||||
/* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
|
||||
Last character that was of combining class 230 ("Above") or 0. */
|
||||
ucs4_t last_char_normal_or_above =
|
||||
prefix_context.last_char_normal_or_above;
|
||||
|
||||
while (s < s_end)
|
||||
{
|
||||
/* Fetch the next character. */
|
||||
ucs4_t uc;
|
||||
int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
|
||||
|
||||
ucs4_t (*single_character_map) (ucs4_t);
|
||||
size_t offset_in_rule; /* offset in 'struct special_casing_rule' */
|
||||
|
||||
ucs4_t mapped_uc[3];
|
||||
unsigned int mapped_count;
|
||||
|
||||
if (*wp)
|
||||
/* Crossing a word boundary. */
|
||||
in_word_first_part = true;
|
||||
|
||||
/* Determine single_character_map, offset_in_rule.
|
||||
There are three possibilities:
|
||||
- uc should not be converted.
|
||||
- uc should be titlecased.
|
||||
- uc should be lowercased. */
|
||||
if (in_word_first_part)
|
||||
{
|
||||
if (uc_is_cased (uc))
|
||||
{
|
||||
/* uc is to be titlecased. */
|
||||
single_character_map = uc_totitle;
|
||||
offset_in_rule = offsetof (struct special_casing_rule, title[0]);
|
||||
in_word_first_part = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* uc is not converted. */
|
||||
single_character_map = NULL;
|
||||
offset_in_rule = 0;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* uc is to be lowercased. */
|
||||
single_character_map = uc_tolower;
|
||||
offset_in_rule = offsetof (struct special_casing_rule, lower[0]);
|
||||
}
|
||||
|
||||
/* Actually map uc. */
|
||||
if (single_character_map == NULL)
|
||||
{
|
||||
mapped_uc[0] = uc;
|
||||
mapped_count = 1;
|
||||
goto found_mapping;
|
||||
}
|
||||
|
||||
if (uc < 0x10000)
|
||||
{
|
||||
/* Look first in the special-casing table. */
|
||||
char code[3];
|
||||
|
||||
code[0] = (uc >> 8) & 0xff;
|
||||
code[1] = uc & 0xff;
|
||||
|
||||
for (code[2] = 0; ; code[2]++)
|
||||
{
|
||||
const struct special_casing_rule *rule =
|
||||
gl_unicase_special_lookup (code, 3);
|
||||
|
||||
if (rule == NULL)
|
||||
break;
|
||||
|
||||
/* Test if the condition applies. */
|
||||
/* Does the language apply? */
|
||||
if (rule->language[0] == '\0'
|
||||
|| (iso639_language != NULL
|
||||
&& iso639_language[0] == rule->language[0]
|
||||
&& iso639_language[1] == rule->language[1]))
|
||||
{
|
||||
/* Does the context apply? */
|
||||
int context = rule->context;
|
||||
bool applies;
|
||||
|
||||
if (context < 0)
|
||||
context = - context;
|
||||
switch (context)
|
||||
{
|
||||
case SCC_ALWAYS:
|
||||
applies = true;
|
||||
break;
|
||||
|
||||
case SCC_FINAL_SIGMA:
|
||||
/* "Before" condition: preceded by a sequence
|
||||
consisting of a cased letter and a case-ignorable
|
||||
sequence.
|
||||
"After" condition: not followed by a sequence
|
||||
consisting of a case-ignorable sequence and then a
|
||||
cased letter. */
|
||||
/* Test the "before" condition. */
|
||||
applies = uc_is_cased (last_char_except_ignorable);
|
||||
/* Test the "after" condition. */
|
||||
if (applies)
|
||||
{
|
||||
const UNIT *s2 = s + count;
|
||||
for (;;)
|
||||
{
|
||||
if (s2 < s_end)
|
||||
{
|
||||
ucs4_t uc2;
|
||||
int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
|
||||
/* Our uc_is_case_ignorable function is
|
||||
known to return false for all cased
|
||||
characters. So we can call
|
||||
uc_is_case_ignorable first. */
|
||||
if (!uc_is_case_ignorable (uc2))
|
||||
{
|
||||
applies = ! uc_is_cased (uc2);
|
||||
break;
|
||||
}
|
||||
s2 += count2;
|
||||
}
|
||||
else
|
||||
{
|
||||
applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case SCC_AFTER_SOFT_DOTTED:
|
||||
/* "Before" condition: There is a Soft_Dotted character
|
||||
before it, with no intervening character of
|
||||
combining class 0 or 230 (Above). */
|
||||
/* Test the "before" condition. */
|
||||
applies = uc_is_property_soft_dotted (last_char_normal_or_above);
|
||||
break;
|
||||
|
||||
case SCC_MORE_ABOVE:
|
||||
/* "After" condition: followed by a character of
|
||||
combining class 230 (Above) with no intervening
|
||||
character of combining class 0 or 230 (Above). */
|
||||
/* Test the "after" condition. */
|
||||
{
|
||||
const UNIT *s2 = s + count;
|
||||
applies = false;
|
||||
for (;;)
|
||||
{
|
||||
if (s2 < s_end)
|
||||
{
|
||||
ucs4_t uc2;
|
||||
int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
|
||||
int ccc = uc_combining_class (uc2);
|
||||
if (ccc == UC_CCC_A)
|
||||
{
|
||||
applies = true;
|
||||
break;
|
||||
}
|
||||
if (ccc == UC_CCC_NR)
|
||||
break;
|
||||
s2 += count2;
|
||||
}
|
||||
else
|
||||
{
|
||||
applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case SCC_BEFORE_DOT:
|
||||
/* "After" condition: followed by COMBINING DOT ABOVE
|
||||
(U+0307). Any sequence of characters with a
|
||||
combining class that is neither 0 nor 230 may
|
||||
intervene between the current character and the
|
||||
combining dot above. */
|
||||
/* Test the "after" condition. */
|
||||
{
|
||||
const UNIT *s2 = s + count;
|
||||
applies = false;
|
||||
for (;;)
|
||||
{
|
||||
if (s2 < s_end)
|
||||
{
|
||||
ucs4_t uc2;
|
||||
int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
|
||||
if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
|
||||
{
|
||||
applies = true;
|
||||
break;
|
||||
}
|
||||
{
|
||||
int ccc = uc_combining_class (uc2);
|
||||
if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
|
||||
break;
|
||||
}
|
||||
s2 += count2;
|
||||
}
|
||||
else
|
||||
{
|
||||
applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case SCC_AFTER_I:
|
||||
/* "Before" condition: There is an uppercase I before
|
||||
it, and there is no intervening character of
|
||||
combining class 0 or 230 (Above). */
|
||||
/* Test the "before" condition. */
|
||||
applies = (last_char_normal_or_above == 'I');
|
||||
break;
|
||||
|
||||
default:
|
||||
abort ();
|
||||
}
|
||||
if (rule->context < 0)
|
||||
applies = !applies;
|
||||
|
||||
if (applies)
|
||||
{
|
||||
/* The rule applies.
|
||||
Look up the mapping (0 to 3 characters). */
|
||||
const unsigned short *mapped_in_rule =
|
||||
(const unsigned short *)((const char *)rule + offset_in_rule);
|
||||
|
||||
if (mapped_in_rule[0] == 0)
|
||||
mapped_count = 0;
|
||||
else
|
||||
{
|
||||
mapped_uc[0] = mapped_in_rule[0];
|
||||
if (mapped_in_rule[1] == 0)
|
||||
mapped_count = 1;
|
||||
else
|
||||
{
|
||||
mapped_uc[1] = mapped_in_rule[1];
|
||||
if (mapped_in_rule[2] == 0)
|
||||
mapped_count = 2;
|
||||
else
|
||||
{
|
||||
mapped_uc[2] = mapped_in_rule[2];
|
||||
mapped_count = 3;
|
||||
}
|
||||
}
|
||||
}
|
||||
goto found_mapping;
|
||||
}
|
||||
}
|
||||
|
||||
/* Optimization: Save a hash table lookup in the next round. */
|
||||
if (!rule->has_next)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* No special-cased mapping. So use the locale and context independent
|
||||
mapping. */
|
||||
mapped_uc[0] = single_character_map (uc);
|
||||
mapped_count = 1;
|
||||
|
||||
found_mapping:
|
||||
/* Found the mapping: uc maps to mapped_uc[0..mapped_count-1]. */
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < mapped_count; i++)
|
||||
{
|
||||
ucs4_t muc = mapped_uc[i];
|
||||
|
||||
/* Append muc to the result accumulator. */
|
||||
if (length < allocated)
|
||||
{
|
||||
int ret = U_UCTOMB (result + length, muc, allocated - length);
|
||||
if (ret == -1)
|
||||
{
|
||||
errno = EINVAL;
|
||||
goto fail1;
|
||||
}
|
||||
if (ret >= 0)
|
||||
{
|
||||
length += ret;
|
||||
goto done_appending;
|
||||
}
|
||||
}
|
||||
{
|
||||
size_t old_allocated = allocated;
|
||||
size_t new_allocated = 2 * old_allocated;
|
||||
if (new_allocated < 64)
|
||||
new_allocated = 64;
|
||||
if (new_allocated < old_allocated) /* integer overflow? */
|
||||
abort ();
|
||||
{
|
||||
UNIT *larger_result;
|
||||
if (result == NULL)
|
||||
{
|
||||
larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
|
||||
if (larger_result == NULL)
|
||||
{
|
||||
errno = ENOMEM;
|
||||
goto fail1;
|
||||
}
|
||||
}
|
||||
else if (result == resultbuf)
|
||||
{
|
||||
larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
|
||||
if (larger_result == NULL)
|
||||
{
|
||||
errno = ENOMEM;
|
||||
goto fail1;
|
||||
}
|
||||
U_CPY (larger_result, resultbuf, length);
|
||||
}
|
||||
else
|
||||
{
|
||||
larger_result =
|
||||
(UNIT *) realloc (result, new_allocated * sizeof (UNIT));
|
||||
if (larger_result == NULL)
|
||||
{
|
||||
errno = ENOMEM;
|
||||
goto fail1;
|
||||
}
|
||||
}
|
||||
result = larger_result;
|
||||
allocated = new_allocated;
|
||||
{
|
||||
int ret = U_UCTOMB (result + length, muc, allocated - length);
|
||||
if (ret == -1)
|
||||
{
|
||||
errno = EINVAL;
|
||||
goto fail1;
|
||||
}
|
||||
if (ret < 0)
|
||||
abort ();
|
||||
length += ret;
|
||||
goto done_appending;
|
||||
}
|
||||
}
|
||||
}
|
||||
done_appending: ;
|
||||
}
|
||||
}
|
||||
|
||||
if (!uc_is_case_ignorable (uc))
|
||||
last_char_except_ignorable = uc;
|
||||
|
||||
{
|
||||
int ccc = uc_combining_class (uc);
|
||||
if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
|
||||
last_char_normal_or_above = uc;
|
||||
}
|
||||
|
||||
s += count;
|
||||
wp += count;
|
||||
}
|
||||
}
|
||||
|
||||
free (wordbreaks);
|
||||
|
||||
if (nf != NULL)
|
||||
{
|
||||
/* Finally, normalize the result. */
|
||||
UNIT *normalized_result;
|
||||
|
||||
normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
|
||||
if (normalized_result == NULL)
|
||||
goto fail2;
|
||||
|
||||
free (result);
|
||||
return normalized_result;
|
||||
}
|
||||
|
||||
if (length == 0)
|
||||
{
|
||||
if (result == NULL)
|
||||
{
|
||||
/* Return a non-NULL value. NULL means error. */
|
||||
result = (UNIT *) malloc (1);
|
||||
if (result == NULL)
|
||||
{
|
||||
errno = ENOMEM;
|
||||
goto fail2;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (result != resultbuf && length < allocated)
|
||||
{
|
||||
/* Shrink the allocated memory if possible. */
|
||||
UNIT *memory;
|
||||
|
||||
memory = (UNIT *) realloc (result, length * sizeof (UNIT));
|
||||
if (memory != NULL)
|
||||
result = memory;
|
||||
}
|
||||
|
||||
*lengthp = length;
|
||||
return result;
|
||||
|
||||
fail1:
|
||||
{
|
||||
int saved_errno = errno;
|
||||
free (wordbreaks);
|
||||
errno = saved_errno;
|
||||
}
|
||||
fail2:
|
||||
if (result != resultbuf)
|
||||
{
|
||||
int saved_errno = errno;
|
||||
free (result);
|
||||
errno = saved_errno;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Local Variables:
|
||||
* coding: utf-8
|
||||
* End:
|
||||
*/
|
||||
Reference in New Issue
Block a user