libidn 1.43
nfkc.c
Go to the documentation of this file.
/* nfkc.c --- Unicode normalization utilities.
   Copyright (C) 2002-2025 Simon Josefsson

   This file is part of GNU Libidn.

   GNU Libidn is free software: you can redistribute it and/or
   modify it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at
       your option) any later version.

   or

     * the GNU General Public License as published by the Free
       Software Foundation; either version 2 of the License, or (at
       your option) any later version.

   or both in parallel, as here.

   GNU Libidn is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   General Public License for more details.

   You should have received copies of the GNU General Public License and
   the GNU Lesser General Public License along with this program. If
   not, see <https://www.gnu.org/licenses/>. */
29
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "stringprep.h"
38
/* Hacks to make syncing with GLIB code easier: the GLib type and
   helper names used by the borrowed code below are mapped onto their
   plain C equivalents. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free
/* Return VAL from the enclosing function when EXPR is false.  The
   do/while (0) wrapper makes the expansion a single statement, so the
   macro followed by ';' is safe in an unbraced if/else. */
#define g_return_val_if_fail(expr,val)  \
  do                                    \
    {                                   \
      if (!(expr))                      \
        return (val);                   \
    }                                   \
  while (0)
57
/* Code from GLIB gmacros.h starts here. */

/* GLIB - Library of useful routines for C programming
 * Copyright (C) 1995-1997 Peter Mattis, Spencer Kimball and Josh MacDonald
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 */
73
/* Boolean constants; left alone when something else already defined them. */
#ifndef FALSE
# define FALSE (0)
#endif

#ifndef TRUE
# define TRUE (!FALSE)
#endif

/* Number of elements in ARR.  ARR must be a real array, not a pointer. */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))

/* Placeholder for GLib's branch hint: here it simply evaluates EXPR. */
#define G_UNLIKELY(expr) (expr)
85
/* Code from GLIB gunicode.h starts here. */

/* gunicode.h - Unicode manipulation functions
 *
 * Copyright (C) 1999, 2000 Tom Tromey
 * Copyright 2000, 2005 Red Hat, Inc.
 *
 * The Gnome Library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * The Gnome Library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 */
103
116
/* Pointer to the character following the one at P.  The distance is
   looked up from the byte at P in g_utf8_skip; nothing is validated,
   so P is expected to point at the first byte of a well-formed
   sequence.  P is evaluated twice. */
#define g_utf8_next_char(p) ((p) + g_utf8_skip[*(const guchar *)(p)])
118
/* Code from GLIB gutf8.c starts here. */

/* gutf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 */
136
/* Classify the lead byte CHAR: set LEN to the length of the sequence
   it introduces (1-6) and MASK to the payload bits of the lead byte.
   When CHAR cannot start a sequence, LEN is set to -1 and MASK is
   left untouched.  Expands to a single statement. */
#define UTF8_COMPUTE(Char, Mask, Len)           \
  do                                            \
    {                                           \
      if ((Char) < 128)                         \
        {                                       \
          (Len) = 1;                            \
          (Mask) = 0x7f;                        \
        }                                       \
      else if (((Char) & 0xe0) == 0xc0)         \
        {                                       \
          (Len) = 2;                            \
          (Mask) = 0x1f;                        \
        }                                       \
      else if (((Char) & 0xf0) == 0xe0)         \
        {                                       \
          (Len) = 3;                            \
          (Mask) = 0x0f;                        \
        }                                       \
      else if (((Char) & 0xf8) == 0xf0)         \
        {                                       \
          (Len) = 4;                            \
          (Mask) = 0x07;                        \
        }                                       \
      else if (((Char) & 0xfc) == 0xf8)         \
        {                                       \
          (Len) = 5;                            \
          (Mask) = 0x03;                        \
        }                                       \
      else if (((Char) & 0xfe) == 0xfc)         \
        {                                       \
          (Len) = 6;                            \
          (Mask) = 0x01;                        \
        }                                       \
      else                                      \
        (Len) = -1;                             \
    }                                           \
  while (0)

/* Number of bytes needed to encode the code point CHAR (1-6). */
#define UTF8_LENGTH(Char)                       \
  ((Char) < 0x80 ? 1 :                          \
   ((Char) < 0x800 ? 2 :                        \
    ((Char) < 0x10000 ? 3 :                     \
     ((Char) < 0x200000 ? 4 :                   \
      ((Char) < 0x4000000 ? 5 : 6)))))

/* Decode the LEN-byte sequence at CHARS into RESULT, using MASK (from
   UTF8_COMPUTE) for the lead byte and COUNT as the loop index.  RESULT
   becomes -1 as soon as a byte that is not a 10xxxxxx continuation is
   met.  Expands to a single statement. */
#define UTF8_GET(Result, Chars, Count, Mask, Len)       \
  do                                                    \
    {                                                   \
      (Result) = (Chars)[0] & (Mask);                   \
      for ((Count) = 1; (Count) < (Len); ++(Count))     \
        {                                               \
          if (((Chars)[(Count)] & 0xc0) != 0x80)        \
            {                                           \
              (Result) = -1;                            \
              break;                                    \
            }                                           \
          (Result) <<= 6;                               \
          (Result) |= ((Chars)[(Count)] & 0x3f);        \
        }                                               \
    }                                                   \
  while (0)
190
191static const gchar utf8_skip_data[256] = {
192 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
193 1, 1, 1, 1, 1, 1, 1,
194 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
195 1, 1, 1, 1, 1, 1, 1,
196 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
197 1, 1, 1, 1, 1, 1, 1,
198 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
199 1, 1, 1, 1, 1, 1, 1,
200 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
201 1, 1, 1, 1, 1, 1, 1,
202 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
203 1, 1, 1, 1, 1, 1, 1,
204 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
205 2, 2, 2, 2, 2, 2, 2,
206 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
207 5, 5, 5, 6, 6, 1, 1
208};
209
210static const gchar *const g_utf8_skip = utf8_skip_data;
211
212/*
213 * g_utf8_strlen:
214 * @p: pointer to the start of a UTF-8 encoded string
215 * @max: the maximum number of bytes to examine. If @max
216 * is less than 0, then the string is assumed to be
217 * nul-terminated. If @max is 0, @p will not be examined and
218 * may be %NULL.
219 *
220 * Computes the length of the string in characters, not including
221 * the terminating nul character.
222 *
223 * Return value: the length of the string in characters
224 **/
225static gsize
226g_utf8_strlen (const gchar *p)
227{
228 gsize len = 0;
229
230 g_return_val_if_fail (p != NULL, 0);
231
232 while (*p)
233 {
234 p = g_utf8_next_char (p);
235 ++len;
236 }
237
238 return len;
239}
240
241/*
242 * g_utf8_get_char:
243 * @p: a pointer to Unicode character encoded as UTF-8
244 *
245 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
246 * If @p does not point to a valid UTF-8 encoded character, results are
247 * undefined. If you are not sure that the bytes are complete
248 * valid Unicode characters, you should use g_utf8_get_char_validated()
249 * instead.
250 *
251 * Return value: the resulting character
252 **/
253static gunichar
254g_utf8_get_char (const gchar *p)
255{
256 int i, mask = 0, len;
257 gunichar result;
258 unsigned char c = (unsigned char) *p;
259
260 UTF8_COMPUTE (c, mask, len);
261 if (len == -1)
262 return (gunichar) - 1;
263 UTF8_GET (result, p, i, mask, len);
264
265 return result;
266}
267
268/*
269 * g_unichar_to_utf8:
270 * @c: a Unicode character code
271 * @outbuf: output buffer, must have at least 6 bytes of space.
272 * If %NULL, the length will be computed and returned
273 * and nothing will be written to @outbuf.
274 *
275 * Converts a single character to UTF-8.
276 *
277 * Return value: number of bytes written
278 **/
279static int
280g_unichar_to_utf8 (gunichar c, gchar *outbuf)
281{
282 /* If this gets modified, also update the copy in g_string_insert_unichar() */
283 guint len = 0;
284 int first;
285 int i;
286
287 if (c < 0x80)
288 {
289 first = 0;
290 len = 1;
291 }
292 else if (c < 0x800)
293 {
294 first = 0xc0;
295 len = 2;
296 }
297 else if (c < 0x10000)
298 {
299 first = 0xe0;
300 len = 3;
301 }
302 else if (c < 0x200000)
303 {
304 first = 0xf0;
305 len = 4;
306 }
307 else if (c < 0x4000000)
308 {
309 first = 0xf8;
310 len = 5;
311 }
312 else
313 {
314 first = 0xfc;
315 len = 6;
316 }
317
318 if (outbuf)
319 {
320 for (i = len - 1; i > 0; --i)
321 {
322 outbuf[i] = (c & 0x3f) | 0x80;
323 c >>= 6;
324 }
325 outbuf[0] = c | first;
326 }
327
328 return len;
329}
330
331/*
332 * g_utf8_to_ucs4_fast:
333 * @str: a UTF-8 encoded string
334 * @len: the maximum length of @str to use, in bytes. If @len < 0,
335 * then the string is nul-terminated.
336 * @items_written: location to store the number of characters in the
337 * result, or %NULL.
338 *
339 * Convert a string from UTF-8 to a 32-bit fixed width
340 * representation as UCS-4, assuming valid UTF-8 input.
341 * This function is roughly twice as fast as g_utf8_to_ucs4()
342 * but does no error checking on the input. A trailing 0 character
343 * will be added to the string after the converted text.
344 *
345 * Return value: a pointer to a newly allocated UCS-4 string.
346 * This value must be freed with g_free().
347 **/
348static gunichar *
349g_utf8_to_ucs4_fast (const gchar *str, gssize len, gsize *items_written)
350{
351 gunichar *result;
352 gsize n_chars, i;
353 const gchar *p;
354
355 g_return_val_if_fail (str != NULL, NULL);
356
357 p = str;
358 n_chars = 0;
359 if (len < 0)
360 {
361 while (*p)
362 {
363 p = g_utf8_next_char (p);
364 ++n_chars;
365 }
366 }
367 else
368 {
369 while (p < str + len && *p)
370 {
371 p = g_utf8_next_char (p);
372 ++n_chars;
373 }
374 }
375
376 result = g_malloc (sizeof (gunichar) * (n_chars + 1));
377 if (!result)
378 return NULL;
379
380 p = str;
381 for (i = 0; i < n_chars; i++)
382 {
383 gunichar wc = (guchar) * p++;
384
385 if (wc < 0x80)
386 {
387 result[i] = wc;
388 }
389 else
390 {
391 gunichar mask = 0x40;
392
393 if (G_UNLIKELY ((wc & mask) == 0))
394 {
395 /* It's an out-of-sequence 10xxxxxxx byte.
396 * Rather than making an ugly hash of this and the next byte
397 * and overrunning the buffer, it's more useful to treat it
398 * with a replacement character */
399 result[i] = 0xfffd;
400 continue;
401 }
402
403 do
404 {
405 wc <<= 6;
406 wc |= (guchar) (*p++) & 0x3f;
407 mask <<= 5;
408 }
409 while ((wc & mask) != 0);
410
411 wc &= mask - 1;
412
413 result[i] = wc;
414 }
415 }
416 result[i] = 0;
417
418 if (items_written)
419 *items_written = i;
420
421 return result;
422}
423
424/*
425 * g_ucs4_to_utf8:
426 * @str: a UCS-4 encoded string
427 * @len: the maximum length (number of characters) of @str to use.
428 * If @len < 0, then the string is nul-terminated.
429 * @items_read: location to store number of characters read, or %NULL.
430 * @items_written: location to store number of bytes written or %NULL.
431 * The value here stored does not include the trailing 0
432 * byte.
433 * @error: location to store the error occurring, or %NULL to ignore
434 * errors. Any of the errors in #GConvertError other than
435 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
436 *
437 * Convert a string from a 32-bit fixed width representation as UCS-4.
438 * to UTF-8. The result will be terminated with a 0 byte.
439 *
440 * Return value: a pointer to a newly allocated UTF-8 string.
441 * This value must be freed with g_free(). If an
442 * error occurs, %NULL will be returned and
443 * @error set. In that case, @items_read will be
444 * set to the position of the first invalid input
445 * character.
446 **/
447static gchar *
448g_ucs4_to_utf8 (const gunichar *str,
449 gsize len, gsize *items_read, gsize *items_written)
450{
451 gint result_length;
452 gchar *result = NULL;
453 gchar *p;
454 gsize i;
455
456 result_length = 0;
457 for (i = 0; i < len; i++)
458 {
459 if (!str[i])
460 break;
461
462 if (str[i] >= 0x80000000)
463 goto err_out;
464
465 result_length += UTF8_LENGTH (str[i]);
466 }
467
468 result = g_malloc (result_length + 1);
469 if (!result)
470 return NULL;
471 p = result;
472
473 i = 0;
474 while (p < result + result_length)
475 p += g_unichar_to_utf8 (str[i++], p);
476
477 *p = '\0';
478
479 if (items_written)
480 *items_written = p - result;
481
482err_out:
483 if (items_read)
484 *items_read = i;
485
486 return result;
487}
488
/* Code from GLIB gunidecomp.c starts here. */

/* decomp.c - Character decomposition.
 *
 * Copyright (C) 1999, 2000 Tom Tromey
 * Copyright 2000 Red Hat, Inc.
 *
 * The Gnome Library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * The Gnome Library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 */
506
507#include "gunidecomp.h"
508#include "gunicomp.h"
509
/* Two-level lookup of a combining class.  A page-table entry that is
   >= G_UNICODE_MAX_TABLE_INDEX stores the class for the whole page
   directly (offset by that constant); any other entry selects a row
   of cclass_data that is indexed by the low byte of the character. */
#define CC_PART1(Page, Char) \
  ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part1[Page]][Char]))

#define CC_PART2(Page, Char) \
  ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part2[Page]][Char]))

/* Combining class of CHAR.  Part 1 covers characters up to
   G_UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000 up to
   G_UNICODE_LAST_CHAR; everything else has class 0.  CHAR is
   evaluated more than once. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
526
/* constants for hangul syllable [de]composition: first code point of
   the syllable (S), leading consonant (L), vowel (V) and trailing
   consonant (T) ranges, and the size of each range.  TBase is one
   below the first trailing consonant, so a T index of 0 means "no
   trailing consonant". */
#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount * TCount)
#define SCount (LCount * NCount)
537
538/*
539 * g_unicode_canonical_ordering:
540 * @string: a UCS-4 encoded string.
541 * @len: the maximum length of @string to use.
542 *
543 * Computes the canonical ordering of a string in-place.
544 * This rearranges decomposed characters in the string
545 * according to their combining classes. See the Unicode
546 * manual for more information.
547 **/
548static void
549g_unicode_canonical_ordering (gunichar *string, gsize len)
550{
551 gsize i;
552 int swap = 1;
553
554 while (swap)
555 {
556 int last;
557 swap = 0;
558 last = COMBINING_CLASS (string[0]);
559 for (i = 0; i < len - 1; ++i)
560 {
561 int next = COMBINING_CLASS (string[i + 1]);
562 if (next != 0 && last > next)
563 {
564 gsize j;
565 /* Percolate item leftward through string. */
566 for (j = i + 1; j > 0; --j)
567 {
568 gunichar t;
569 if (COMBINING_CLASS (string[j - 1]) <= next)
570 break;
571 t = string[j];
572 string[j] = string[j - 1];
573 string[j - 1] = t;
574 swap = 1;
575 }
576 /* We're re-entering the loop looking at the old
577 character again. */
578 next = last;
579 }
580 last = next;
581 }
582 }
583}
584
585/* http://www.unicode.org/unicode/reports/tr15/#Hangul
586 * r should be null or have sufficient space. Calling with r == NULL will
587 * only calculate the result_len; however, a buffer with space for three
588 * characters will always be big enough. */
589static void
590decompose_hangul (gunichar s, gunichar *r, gsize *result_len)
591{
592 gint SIndex = s - SBase;
593 gint TIndex = SIndex % TCount;
594
595 if (r)
596 {
597 r[0] = LBase + SIndex / NCount;
598 r[1] = VBase + (SIndex % NCount) / TCount;
599 }
600
601 if (TIndex)
602 {
603 if (r)
604 r[2] = TBase + TIndex;
605 *result_len = 3;
606 }
607 else
608 *result_len = 2;
609}
610
611/* returns a pointer to a null-terminated UTF-8 string */
612static const gchar *
613find_decomposition (gunichar ch, gboolean compat)
614{
615 int start = 0;
616 int end = G_N_ELEMENTS (decomp_table);
617
618 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
619 {
620 while (TRUE)
621 {
622 int half = (start + end) / 2;
623 if (ch == decomp_table[half].ch)
624 {
625 int offset;
626
627 if (compat)
628 {
629 offset = decomp_table[half].compat_offset;
630 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
631 offset = decomp_table[half].canon_offset;
632 }
633 else
634 {
635 offset = decomp_table[half].canon_offset;
636 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
637 return NULL;
638 }
639
640 return &(decomp_expansion_string[offset]);
641 }
642 else if (half == start)
643 break;
644 else if (ch > decomp_table[half].ch)
645 start = half;
646 else
647 end = half;
648 }
649 }
650
651 return NULL;
652}
653
654/* L,V => LV and LV,T => LVT */
655static gboolean
656combine_hangul (gunichar a, gunichar b, gunichar *result)
657{
658 if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)
659 {
660 gint LIndex = a - LBase;
661 gint VIndex = b - VBase;
662
663 *result = SBase + (LIndex * VCount + VIndex) * TCount;
664 return TRUE;
665 }
666
667 if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)
668 {
669 gint SIndex = a - SBase;
670
671 if ((SIndex % TCount) == 0)
672 {
673 gint TIndex = b - TBase;
674
675 *result = a + TIndex;
676 return TRUE;
677 }
678 }
679
680 return FALSE;
681}
682
/* Two-level lookup of a composition index.  A compose_table entry
   that is >= G_UNICODE_MAX_TABLE_INDEX holds the index for the whole
   page directly (offset by that constant); any other entry selects a
   row of compose_data indexed by the low byte of the character. */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of CHAR; 0 for characters on pages beyond
   COMPOSE_TABLE_LAST.  CHAR is evaluated more than once. */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
690
691static gboolean
692combine (gunichar a, gunichar b, gunichar *result)
693{
694 gushort index_a, index_b;
695
696 if (combine_hangul (a, b, result))
697 return TRUE;
698
699 index_a = COMPOSE_INDEX (a);
700
701 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
702 {
703 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
704 {
705 *result =
706 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
707 return TRUE;
708 }
709 else
710 return FALSE;
711 }
712
713 index_b = COMPOSE_INDEX (b);
714
715 if (index_b >= COMPOSE_SECOND_SINGLE_START)
716 {
717 if (a ==
718 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
719 {
720 *result =
721 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
722 return TRUE;
723 }
724 else
725 return FALSE;
726 }
727
728 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
729 && index_b >= COMPOSE_SECOND_START
730 && index_b < COMPOSE_SECOND_SINGLE_START)
731 {
732 gunichar res =
733 compose_array[index_a - COMPOSE_FIRST_START][index_b -
735
736 if (res)
737 {
738 *result = res;
739 return TRUE;
740 }
741 }
742
743 return FALSE;
744}
745
746static gunichar *
747_g_utf8_normalize_wc (const gchar *str, gssize max_len, GNormalizeMode mode)
748{
749 gsize n_wc;
750 gunichar *wc_buffer;
751 const char *p;
752 gsize last_start;
753 gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
754 gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
755
756 n_wc = 0;
757 p = str;
758 while ((max_len < 0 || p < str + max_len) && *p)
759 {
760 const gchar *decomp;
761 gunichar wc = g_utf8_get_char (p);
762
763 if (wc >= SBase && wc < SBase + SCount)
764 {
765 gsize result_len;
766 decompose_hangul (wc, NULL, &result_len);
767 n_wc += result_len;
768 }
769 else
770 {
771 decomp = find_decomposition (wc, do_compat);
772
773 if (decomp)
774 n_wc += g_utf8_strlen (decomp);
775 else
776 n_wc++;
777 }
778
779 p = g_utf8_next_char (p);
780 }
781
782 wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
783 if (!wc_buffer)
784 return NULL;
785
786 last_start = 0;
787 n_wc = 0;
788 p = str;
789 while ((max_len < 0 || p < str + max_len) && *p)
790 {
791 gunichar wc = g_utf8_get_char (p);
792 const gchar *decomp;
793 int cc;
794 gsize old_n_wc = n_wc;
795
796 if (wc >= SBase && wc < SBase + SCount)
797 {
798 gsize result_len;
799 decompose_hangul (wc, wc_buffer + n_wc, &result_len);
800 n_wc += result_len;
801 }
802 else
803 {
804 decomp = find_decomposition (wc, do_compat);
805
806 if (decomp)
807 {
808 const char *pd;
809 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
810 wc_buffer[n_wc++] = g_utf8_get_char (pd);
811 }
812 else
813 wc_buffer[n_wc++] = wc;
814 }
815
816 if (n_wc > 0)
817 {
818 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
819
820 if (cc == 0)
821 {
822 g_unicode_canonical_ordering (wc_buffer + last_start,
823 n_wc - last_start);
824 last_start = old_n_wc;
825 }
826 }
827
828 p = g_utf8_next_char (p);
829 }
830
831 if (n_wc > 0)
832 {
833 g_unicode_canonical_ordering (wc_buffer + last_start,
834 n_wc - last_start);
835 /* dead assignment: last_start = n_wc; */
836 }
837
838 wc_buffer[n_wc] = 0;
839
840 /* All decomposed and reordered */
841
842 if (do_compose && n_wc > 0)
843 {
844 gsize i, j;
845 int last_cc = 0;
846 last_start = 0;
847
848 for (i = 0; i < n_wc; i++)
849 {
850 int cc = COMBINING_CLASS (wc_buffer[i]);
851
852 if (i > 0 &&
853 (last_cc == 0 || last_cc != cc) &&
854 combine (wc_buffer[last_start], wc_buffer[i],
855 &wc_buffer[last_start]))
856 {
857 for (j = i + 1; j < n_wc; j++)
858 wc_buffer[j - 1] = wc_buffer[j];
859 n_wc--;
860 i--;
861
862 if (i == last_start)
863 last_cc = 0;
864 else
865 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
866
867 continue;
868 }
869
870 if (cc == 0)
871 last_start = i;
872
873 last_cc = cc;
874 }
875 }
876
877 wc_buffer[n_wc] = 0;
878
879 return wc_buffer;
880}
881
882/*
883 * g_utf8_normalize:
884 * @str: a UTF-8 encoded string.
885 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
886 * @mode: the type of normalization to perform.
887 *
888 * Converts a string into canonical form, standardizing
889 * such issues as whether a character with an accent
890 * is represented as a base character and combining
891 * accent or as a single precomposed character. The
892 * string has to be valid UTF-8, otherwise %NULL is
893 * returned. You should generally call g_utf8_normalize()
894 * before comparing two Unicode strings.
895 *
896 * The normalization mode %G_NORMALIZE_DEFAULT only
897 * standardizes differences that do not affect the
898 * text content, such as the above-mentioned accent
899 * representation. %G_NORMALIZE_ALL also standardizes
900 * the "compatibility" characters in Unicode, such
901 * as SUPERSCRIPT THREE to the standard forms
902 * (in this case DIGIT THREE). Formatting information
903 * may be lost but for most text operations such
904 * characters should be considered the same.
905 *
906 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
907 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
908 * but returned a result with composed forms rather
909 * than a maximally decomposed form. This is often
910 * useful if you intend to convert the string to
911 * a legacy encoding or pass it to a system with
912 * less capable Unicode handling.
913 *
914 * Return value: a newly allocated string, that is the
915 * normalized form of @str, or %NULL if @str is not
916 * valid UTF-8.
917 **/
918static gchar *
919g_utf8_normalize (const gchar *str, gssize len, GNormalizeMode mode)
920{
921 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
922 gchar *result = NULL;
923
924 if (result_wc)
925 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
926
927 g_free (result_wc);
928
929 return result;
930}
931
/* Public Libidn API starts here. */
933
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to a Unicode character encoded as UTF-8
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode
 * character.  The input is not fully validated; see
 * g_utf8_get_char() for what is checked.
 *
 * Return value: the resulting character, or (uint32_t) -1 if the
 *   bytes at @p are rejected by g_utf8_get_char().
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  return g_utf8_get_char (p);
}
949
/**
 * stringprep_unichar_to_utf8:
 * @c: a Unicode character code
 * @outbuf: output buffer with at least 6 bytes of space, or %NULL
 *   to only compute the length without writing anything
 *
 * Converts a single character to UTF-8.
 *
 * Return value: number of bytes written (or that would be written).
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  return g_unichar_to_utf8 (c, outbuf);
}
966
967#include <unistr.h>
968
/**
 * stringprep_utf8_to_ucs4:
 * @str: a UTF-8 encoded string
 * @len: the maximum length of @str to use, in bytes.  If @len is
 *   negative, @str is taken to be nul-terminated.
 * @items_written: location to store the number of characters in the
 *   result, or %NULL.
 *
 * Converts a string from UTF-8 to UCS-4.  The bytes to be converted
 * are first checked with u8_check().  A trailing 0 character is
 * added after the converted text.
 *
 * Return value: a newly allocated UCS-4 string to be released with
 *   free(), or %NULL if the input is not valid UTF-8 or the
 *   conversion fails.
 **/
uint32_t *
stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)
{
  size_t n;

  if (len < 0)
    n = strlen (str);
  else
    n = len;

  /* u8_check() returns non-NULL (a pointer to the offending unit)
     when the input is malformed. */
  if (u8_check ((const uint8_t *) str, n))
    return NULL;

  return g_utf8_to_ucs4_fast (str, len, items_written);
}
1000
/**
 * stringprep_ucs4_to_utf8:
 * @str: a UCS-4 encoded string
 * @len: the maximum length (number of characters) of @str to use.
 *   Conversion also stops at the first 0 character; a negative @len
 *   converts to a very large unsigned limit, so only the terminator
 *   ends the input.
 * @items_read: location to store number of characters read, or %NULL.
 * @items_written: location to store number of bytes written, not
 *   counting the trailing 0 byte, or %NULL.
 *
 * Converts a string from UCS-4 to UTF-8.  The result is terminated
 * with a 0 byte.
 *
 * Return value: a newly allocated UTF-8 string to be released with
 *   free(), or %NULL on failure (see g_ucs4_to_utf8()).
 **/
char *
stringprep_ucs4_to_utf8 (const uint32_t *str, ssize_t len,
                         size_t *items_read, size_t *items_written)
{
  return g_ucs4_to_utf8 (str, len, items_read, items_written);
}
1024
1047char *
1048stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1049{
1050 size_t n;
1051
1052 if (len < 0)
1053 n = strlen (str);
1054 else
1055 n = len;
1056
1057 if (u8_check ((const uint8_t *) str, n))
1058 return NULL;
1059
1060 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1061}
1062
1063#include <stdio.h>
1075uint32_t *
1076stringprep_ucs4_nfkc_normalize (const uint32_t *str, ssize_t len)
1077{
1078 char *p;
1079 uint32_t *result_wc;
1080
1081 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1082 if (!p)
1083 return NULL;
1084
1085 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1086 free (p);
1087
1088 return result_wc;
1089}
#define COMPOSE_SECOND_SINGLE_START
Definition gunicomp.h:7
#define COMPOSE_SECOND_START
Definition gunicomp.h:6
#define COMPOSE_FIRST_START
Definition gunicomp.h:4
#define COMPOSE_FIRST_SINGLE_START
Definition gunicomp.h:5
#define G_UNICODE_NOT_PRESENT_OFFSET
Definition gunidecomp.h:15
#define g_return_val_if_fail(expr, val)
Definition nfkc.c:53
#define SCount
Definition nfkc.c:536
#define gssize
Definition nfkc.c:50
#define gushort
Definition nfkc.c:45
char * stringprep_ucs4_to_utf8(const uint32_t *str, ssize_t len, size_t *items_read, size_t *items_written)
Definition nfkc.c:1019
#define UTF8_COMPUTE(Char, Mask, Len)
Definition nfkc.c:137
int stringprep_unichar_to_utf8(uint32_t c, char *outbuf)
Definition nfkc.c:962
#define gunichar
Definition nfkc.c:48
#define COMPOSE_INDEX(Char)
Definition nfkc.c:688
uint32_t * stringprep_ucs4_nfkc_normalize(const uint32_t *str, ssize_t len)
Definition nfkc.c:1076
#define guint
Definition nfkc.c:44
#define g_free
Definition nfkc.c:52
#define G_N_ELEMENTS(arr)
Definition nfkc.c:82
#define gchar
Definition nfkc.c:41
#define LBase
Definition nfkc.c:529
#define gint
Definition nfkc.c:43
#define UTF8_LENGTH(Char)
Definition nfkc.c:171
char * stringprep_utf8_nfkc_normalize(const char *str, ssize_t len)
Definition nfkc.c:1048
#define g_utf8_next_char(p)
Definition nfkc.c:117
#define TRUE
Definition nfkc.c:79
#define FALSE
Definition nfkc.c:75
#define G_UNLIKELY(expr)
Definition nfkc.c:84
#define TBase
Definition nfkc.c:531
#define UTF8_GET(Result, Chars, Count, Mask, Len)
Definition nfkc.c:178
#define VBase
Definition nfkc.c:530
uint32_t stringprep_utf8_to_unichar(const char *p)
Definition nfkc.c:945
#define COMBINING_CLASS(Char)
Definition nfkc.c:520
#define NCount
Definition nfkc.c:535
#define guchar
Definition nfkc.c:42
#define g_malloc
Definition nfkc.c:51
GNormalizeMode
Definition nfkc.c:105
@ G_NORMALIZE_DEFAULT_COMPOSE
Definition nfkc.c:108
@ G_NORMALIZE_NFKC
Definition nfkc.c:113
@ G_NORMALIZE_NFKD
Definition nfkc.c:111
@ G_NORMALIZE_ALL
Definition nfkc.c:110
@ G_NORMALIZE_NFD
Definition nfkc.c:107
@ G_NORMALIZE_DEFAULT
Definition nfkc.c:106
@ G_NORMALIZE_ALL_COMPOSE
Definition nfkc.c:112
@ G_NORMALIZE_NFC
Definition nfkc.c:109
uint32_t * stringprep_utf8_to_ucs4(const char *str, ssize_t len, size_t *items_written)
Definition nfkc.c:986
#define SBase
Definition nfkc.c:528
#define TCount
Definition nfkc.c:534
#define gsize
Definition nfkc.c:49
#define VCount
Definition nfkc.c:533
#define gboolean
Definition nfkc.c:40