libidn 1.43
idna.c
Go to the documentation of this file.
1/* idna.c --- Implementation of the Internationalized Domain Name library.
2 Copyright (C) 2002-2025 Simon Josefsson
3
4 This file is part of GNU Libidn.
5
6 GNU Libidn is free software: you can redistribute it and/or
7 modify it under the terms of either:
8
9 * the GNU Lesser General Public License as published by the Free
10 Software Foundation; either version 3 of the License, or (at
11 your option) any later version.
12
13 or
14
15 * the GNU General Public License as published by the Free
16 Software Foundation; either version 2 of the License, or (at
17 your option) any later version.
18
19 or both in parallel, as here.
20
21 GNU Libidn is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 General Public License for more details.
25
26 You should have received copies of the GNU General Public License and
27 the GNU Lesser General Public License along with this program. If
28 not, see <https://www.gnu.org/licenses/>. */
29
30#ifdef HAVE_CONFIG_H
31# include "config.h"
32#endif
33
34#include <stdlib.h>
35#include <string.h>
36#include <stringprep.h>
37#include <punycode.h>
38
39#include "idna.h"
40
41/* Get c_strcasecmp. */
42#include <c-strcase.h>
43
/* Non-zero when C is one of the code points that must be recognized
   as a label separator: U+002E (full stop), U+3002 (ideographic full
   stop), U+FF0E (fullwidth full stop) or U+FF61 (halfwidth
   ideographic full stop).  C is evaluated more than once, so it must
   not have side effects. */
#define DOTP(c)                 \
  ((c) == 0x002E                \
   || (c) == 0x3002             \
   || (c) == 0xFF0E             \
   || (c) == 0xFF61)
46
47/* Core functions */
48
80int
81idna_to_ascii_4i (const uint32_t *in, size_t inlen, char *out, int flags)
82{
83 size_t len, outlen;
84 uint32_t *src; /* XXX don't need to copy data? */
85 int rc;
86
87 /*
88 * ToASCII consists of the following steps:
89 *
90 * 1. If all code points in the sequence are in the ASCII range (0..7F)
91 * then skip to step 3.
92 */
93
94 {
95 size_t i;
96 int inasciirange;
97
98 inasciirange = 1;
99 for (i = 0; i < inlen; i++)
100 if (in[i] > 0x7F)
101 inasciirange = 0;
102 if (inasciirange)
103 {
104 src = malloc (sizeof (in[0]) * (inlen + 1));
105 if (src == NULL)
106 return IDNA_MALLOC_ERROR;
107
108 memcpy (src, in, sizeof (in[0]) * inlen);
109 src[inlen] = 0;
110
111 goto step3;
112 }
113 }
114
115 /*
116 * 2. Perform the steps specified in [NAMEPREP] and fail if there is
117 * an error. The AllowUnassigned flag is used in [NAMEPREP].
118 */
119
120 {
121 char *p;
122
123 p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL);
124 if (p == NULL)
125 return IDNA_MALLOC_ERROR;
126
127 len = strlen (p);
128 do
129 {
130 char *newp;
131
132 len = 2 * len + 10; /* XXX better guess? */
133 newp = realloc (p, len);
134 if (newp == NULL)
135 {
136 free (p);
137 return IDNA_MALLOC_ERROR;
138 }
139 p = newp;
140
141 if (flags & IDNA_ALLOW_UNASSIGNED)
142 rc = stringprep_nameprep (p, len);
143 else
145 }
146 while (rc == STRINGPREP_TOO_SMALL_BUFFER);
147
148 if (rc != STRINGPREP_OK)
149 {
150 free (p);
152 }
153
154 src = stringprep_utf8_to_ucs4 (p, -1, NULL);
155
156 free (p);
157
158 if (!src)
159 return IDNA_MALLOC_ERROR;
160 }
161
162step3:
163 /*
164 * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
165 *
166 * (a) Verify the absence of non-LDH ASCII code points; that is,
167 * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
168 *
169 * (b) Verify the absence of leading and trailing hyphen-minus;
170 * that is, the absence of U+002D at the beginning and end of
171 * the sequence.
172 */
173
174 if (flags & IDNA_USE_STD3_ASCII_RULES)
175 {
176 size_t i;
177
178 for (i = 0; src[i]; i++)
179 if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
180 (src[i] >= 0x3A && src[i] <= 0x40) ||
181 (src[i] >= 0x5B && src[i] <= 0x60) ||
182 (src[i] >= 0x7B && src[i] <= 0x7F))
183 {
184 free (src);
186 }
187
188 if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
189 {
190 free (src);
191 return IDNA_CONTAINS_MINUS;
192 }
193 }
194
195 /*
196 * 4. If all code points in the sequence are in the ASCII range
197 * (0..7F), then skip to step 8.
198 */
199
200 {
201 size_t i;
202 int inasciirange;
203
204 inasciirange = 1;
205 for (i = 0; src[i]; i++)
206 {
207 if (src[i] > 0x7F)
208 inasciirange = 0;
209 /* copy string to output buffer if we are about to skip to step8 */
210 if (i < 64)
211 out[i] = src[i];
212 }
213 if (i < 64)
214 out[i] = '\0';
215 else
216 {
217 free (src);
218 return IDNA_INVALID_LENGTH;
219 }
220 if (inasciirange)
221 goto step8;
222 }
223
224 /*
225 * 5. Verify that the sequence does NOT begin with the ACE prefix.
226 *
227 */
228
229 {
230 size_t i;
231 int match;
232
233 match = 1;
234 for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++)
235 if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
236 match = 0;
237 if (match)
238 {
239 free (src);
241 }
242 }
243
244 /*
245 * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
246 * and fail if there is an error.
247 */
248 for (len = 0; src[len]; len++)
249 ;
250 src[len] = '\0';
251 outlen = 63 - strlen (IDNA_ACE_PREFIX);
252 rc = punycode_encode (len, src, NULL,
253 &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
254 if (rc != PUNYCODE_SUCCESS)
255 {
256 free (src);
257 return IDNA_PUNYCODE_ERROR;
258 }
259 out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
260
261 /*
262 * 7. Prepend the ACE prefix.
263 */
264
265 memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
266
267 /*
268 * 8. Verify that the number of code points is in the range 1 to 63
269 * inclusive (0 is excluded).
270 */
271
272step8:
273 free (src);
274 if (strlen (out) < 1)
275 return IDNA_INVALID_LENGTH;
276
277 return IDNA_SUCCESS;
278}
279
280/* ToUnicode(). May realloc() utf8in. Will free utf8in unconditionally. */
281static int
282idna_to_unicode_internal (char *utf8in,
283 uint32_t *out, size_t *outlen, int flags)
284{
285 int rc;
286 char tmpout[64];
287 size_t utf8len = strlen (utf8in) + 1;
288 size_t addlen = 0, addinc = utf8len / 10 + 1;
289
290 /*
291 * ToUnicode consists of the following steps:
292 *
293 * 1. If the sequence contains any code points outside the ASCII range
294 * (0..7F) then proceed to step 2, otherwise skip to step 3.
295 */
296
297 {
298 size_t i;
299 int inasciirange;
300
301 inasciirange = 1;
302 for (i = 0; utf8in[i]; i++)
303 if (utf8in[i] & ~0x7F)
304 inasciirange = 0;
305 if (inasciirange)
306 goto step3;
307 }
308
309 /*
310 * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
311 * error. (If step 3 of ToASCII is also performed here, it will not
312 * affect the overall behavior of ToUnicode, but it is not
313 * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
314 */
315 do
316 {
317 char *newp = realloc (utf8in, utf8len + addlen);
318 if (newp == NULL)
319 {
320 free (utf8in);
321 return IDNA_MALLOC_ERROR;
322 }
323 utf8in = newp;
324 if (flags & IDNA_ALLOW_UNASSIGNED)
325 rc = stringprep_nameprep (utf8in, utf8len + addlen);
326 else
327 rc = stringprep_nameprep_no_unassigned (utf8in, utf8len + addlen);
328 addlen += addinc;
329 addinc *= 2;
330 }
331 while (rc == STRINGPREP_TOO_SMALL_BUFFER);
332
333 if (rc != STRINGPREP_OK)
334 {
335 free (utf8in);
337 }
338
339 /* 3. Verify that the sequence begins with the ACE prefix, and save a
340 * copy of the sequence.
341 * ... The ToASCII and ToUnicode operations MUST recognize the ACE
342 prefix in a case-insensitive manner.
343 */
344
345step3:
346 if (c_strncasecmp (utf8in, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)) != 0)
347 {
348 free (utf8in);
349 return IDNA_NO_ACE_PREFIX;
350 }
351
352 /* 4. Remove the ACE prefix.
353 */
354
355 memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
356 strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
357
358 /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
359 * and fail if there is an error. Save a copy of the result of
360 * this step.
361 */
362
363 (*outlen)--; /* reserve one for the zero */
364
365 rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL);
366 if (rc != PUNYCODE_SUCCESS)
367 {
368 free (utf8in);
369 return IDNA_PUNYCODE_ERROR;
370 }
371
372 out[*outlen] = 0; /* add zero */
373
374 /* 6. Apply ToASCII.
375 */
376
377 rc = idna_to_ascii_4i (out, *outlen, tmpout, flags);
378 if (rc != IDNA_SUCCESS)
379 {
380 free (utf8in);
381 return rc;
382 }
383
384 /* 7. Verify that the result of step 6 matches the saved copy from
385 * step 3, using a case-insensitive ASCII comparison.
386 */
387
388 if (c_strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0)
389 {
390 free (utf8in);
392 }
393
394 /* 8. Return the saved copy from step 5.
395 */
396
397 free (utf8in);
398 return IDNA_SUCCESS;
399}
400
436int
437idna_to_unicode_44i (const uint32_t *in, size_t inlen,
438 uint32_t *out, size_t *outlen, int flags)
439{
440 int rc;
441 size_t outlensave = *outlen;
442 char *p;
443
444 p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL);
445 if (p == NULL)
446 return IDNA_MALLOC_ERROR;
447
448 rc = idna_to_unicode_internal (p, out, outlen, flags);
449 if (rc != IDNA_SUCCESS)
450 {
451 memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ?
452 inlen : outlensave));
453 *outlen = inlen;
454 }
455
456 /* p is freed in idna_to_unicode_internal. */
457
458 return rc;
459}
460
461/* Wrappers that handle several labels */
462
476int
477idna_to_ascii_4z (const uint32_t *input, char **output, int flags)
478{
479 const uint32_t *start = input;
480 const uint32_t *end;
481 char buf[64];
482 char *out = NULL;
483 int rc;
484
485 /* 1) Whenever dots are used as label separators, the following
486 characters MUST be recognized as dots: U+002E (full stop),
487 U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
488 U+FF61 (halfwidth ideographic full stop). */
489
490 if (input[0] == 0)
491 {
492 /* Handle implicit zero-length root label. */
493 *output = malloc (1);
494 if (!*output)
495 return IDNA_MALLOC_ERROR;
496 strcpy (*output, "");
497 return IDNA_SUCCESS;
498 }
499
500 if (DOTP (input[0]) && input[1] == 0)
501 {
502 /* Handle explicit zero-length root label. */
503 *output = malloc (2);
504 if (!*output)
505 return IDNA_MALLOC_ERROR;
506 strcpy (*output, ".");
507 return IDNA_SUCCESS;
508 }
509
510 *output = NULL;
511 do
512 {
513 end = start;
514
515 for (; *end && !DOTP (*end); end++)
516 ;
517
518 if (*end == '\0' && start == end)
519 {
520 /* Handle explicit zero-length root label. */
521 buf[0] = '\0';
522 }
523 else
524 {
525 rc = idna_to_ascii_4i (start, (size_t) (end - start), buf, flags);
526 if (rc != IDNA_SUCCESS)
527 {
528 free (out);
529 return rc;
530 }
531 }
532
533 if (out)
534 {
535 size_t l = strlen (out) + 1 + strlen (buf) + 1;
536 char *newp = realloc (out, l);
537 if (!newp)
538 {
539 free (out);
540 return IDNA_MALLOC_ERROR;
541 }
542 out = newp;
543 strcat (out, ".");
544 strcat (out, buf);
545 }
546 else
547 {
548 out = strdup (buf);
549 if (!out)
550 return IDNA_MALLOC_ERROR;
551 }
552
553 start = end + 1;
554 }
555 while (*end);
556
557 *output = out;
558
559 return IDNA_SUCCESS;
560}
561
575int
576idna_to_ascii_8z (const char *input, char **output, int flags)
577{
578 uint32_t *ucs4;
579 size_t ucs4len;
580 int rc;
581
582 ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
583 if (!ucs4)
584 return IDNA_ICONV_ERROR;
585
586 rc = idna_to_ascii_4z (ucs4, output, flags);
587
588 free (ucs4);
589
590 return rc;
591
592}
593
608int
609idna_to_ascii_lz (const char *input, char **output, int flags)
610{
611 char *utf8;
612 int rc;
613
614 utf8 = stringprep_locale_to_utf8 (input);
615 if (!utf8)
616 return IDNA_ICONV_ERROR;
617
618 rc = idna_to_ascii_8z (utf8, output, flags);
619
620 free (utf8);
621
622 return rc;
623}
624
639int
640idna_to_unicode_4z4z (const uint32_t *input, uint32_t **output, int flags)
641{
642 const uint32_t *start = input;
643 const uint32_t *end;
644 uint32_t *buf;
645 size_t buflen;
646 uint32_t *out = NULL;
647 size_t outlen = 0;
648 int rc;
649
650 *output = NULL;
651
652 do
653 {
654 end = start;
655
656 for (; *end && !DOTP (*end); end++)
657 ;
658
659 buflen = (size_t) (end - start);
660 buf = malloc (sizeof (buf[0]) * (buflen + 1));
661 if (!buf)
662 {
663 free (out);
664 return IDNA_MALLOC_ERROR;
665 }
666
667 /* don't check for non-malloc return codes as per
668 specification! */
669 rc = idna_to_unicode_44i (start, (size_t) (end - start),
670 buf, &buflen, flags);
671 if (rc == IDNA_MALLOC_ERROR)
672 {
673 free (out);
674 return IDNA_MALLOC_ERROR;
675 }
676
677 if (out)
678 {
679 uint32_t *newp = realloc (out,
680 sizeof (out[0])
681 * (outlen + 1 + buflen + 1));
682 if (!newp)
683 {
684 free (buf);
685 free (out);
686 return IDNA_MALLOC_ERROR;
687 }
688 out = newp;
689 out[outlen++] = 0x002E; /* '.' (full stop) */
690 memcpy (out + outlen, buf, sizeof (buf[0]) * buflen);
691 outlen += buflen;
692 out[outlen] = 0x0;
693 free (buf);
694 }
695 else
696 {
697 out = buf;
698 outlen = buflen;
699 out[outlen] = 0x0;
700 }
701
702 start = end + 1;
703 }
704 while (*end);
705
706 *output = out;
707
708 return IDNA_SUCCESS;
709}
710
725int
726idna_to_unicode_8z4z (const char *input, uint32_t **output, int flags)
727{
728 uint32_t *ucs4;
729 size_t ucs4len;
730 int rc;
731
732 ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
733 if (!ucs4)
734 return IDNA_ICONV_ERROR;
735
736 rc = idna_to_unicode_4z4z (ucs4, output, flags);
737 free (ucs4);
738
739 return rc;
740}
741
756int
757idna_to_unicode_8z8z (const char *input, char **output, int flags)
758{
759 uint32_t *ucs4;
760 int rc;
761
762 rc = idna_to_unicode_8z4z (input, &ucs4, flags);
763 if (rc != IDNA_SUCCESS)
764 return rc;
765
766 *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL);
767 free (ucs4);
768
769 if (!*output)
770 return IDNA_ICONV_ERROR;
771
772 return IDNA_SUCCESS;
773}
774
790int
791idna_to_unicode_8zlz (const char *input, char **output, int flags)
792{
793 char *utf8;
794 int rc;
795
796 rc = idna_to_unicode_8z8z (input, &utf8, flags);
797 if (rc != IDNA_SUCCESS)
798 return rc;
799
800 *output = stringprep_utf8_to_locale (utf8);
801 free (utf8);
802
803 if (!*output)
804 return IDNA_ICONV_ERROR;
805
806 return IDNA_SUCCESS;
807}
808
825int
826idna_to_unicode_lzlz (const char *input, char **output, int flags)
827{
828 char *utf8;
829 int rc;
830
831 utf8 = stringprep_locale_to_utf8 (input);
832 if (!utf8)
833 return IDNA_ICONV_ERROR;
834
835 rc = idna_to_unicode_8zlz (utf8, output, flags);
836 free (utf8);
837
838 return rc;
839}
840
int idna_to_unicode_8zlz(const char *input, char **output, int flags)
Definition idna.c:791
#define DOTP(c)
Definition idna.c:44
int idna_to_unicode_4z4z(const uint32_t *input, uint32_t **output, int flags)
Definition idna.c:640
int idna_to_ascii_8z(const char *input, char **output, int flags)
Definition idna.c:576
int idna_to_ascii_4z(const uint32_t *input, char **output, int flags)
Definition idna.c:477
int idna_to_unicode_lzlz(const char *input, char **output, int flags)
Definition idna.c:826
int idna_to_unicode_8z4z(const char *input, uint32_t **output, int flags)
Definition idna.c:726
int idna_to_unicode_44i(const uint32_t *in, size_t inlen, uint32_t *out, size_t *outlen, int flags)
Definition idna.c:437
int idna_to_unicode_8z8z(const char *input, char **output, int flags)
Definition idna.c:757
int idna_to_ascii_4i(const uint32_t *in, size_t inlen, char *out, int flags)
Definition idna.c:81
int idna_to_ascii_lz(const char *input, char **output, int flags)
Definition idna.c:609
@ IDNA_ROUNDTRIP_VERIFY_ERROR
Definition idna.h:83
@ IDNA_PUNYCODE_ERROR
Definition idna.h:76
@ IDNA_SUCCESS
Definition idna.h:74
@ IDNA_NO_ACE_PREFIX
Definition idna.h:82
@ IDNA_CONTAINS_MINUS
Definition idna.h:80
@ IDNA_ICONV_ERROR
Definition idna.h:85
@ IDNA_STRINGPREP_ERROR
Definition idna.h:75
@ IDNA_CONTAINS_ACE_PREFIX
Definition idna.h:84
@ IDNA_CONTAINS_NON_LDH
Definition idna.h:77
@ IDNA_INVALID_LENGTH
Definition idna.h:81
@ IDNA_MALLOC_ERROR
Definition idna.h:87
@ IDNA_USE_STD3_ASCII_RULES
Definition idna.h:95
@ IDNA_ALLOW_UNASSIGNED
Definition idna.h:94
#define IDNA_ACE_PREFIX
Definition idna.h:99
char * stringprep_ucs4_to_utf8(const uint32_t *str, ssize_t len, size_t *items_read, size_t *items_written)
Definition nfkc.c:1019
uint32_t * stringprep_utf8_to_ucs4(const char *str, ssize_t len, size_t *items_written)
Definition nfkc.c:986
int punycode_decode(size_t input_length, const char input[], size_t *output_length, punycode_uint output[], unsigned char case_flags[])
Definition punycode.c:348
int punycode_encode(size_t input_length, const punycode_uint input[], const unsigned char case_flags[], size_t *output_length, char output[])
Definition punycode.c:196
@ PUNYCODE_SUCCESS
Definition punycode.h:110
@ STRINGPREP_TOO_SMALL_BUFFER
Definition stringprep.h:75
@ STRINGPREP_OK
Definition stringprep.h:67
IDNAPI char * stringprep_utf8_to_locale(const char *str)
Definition toutf8.c:161
#define stringprep_nameprep(in, maxlen)
Definition stringprep.h:202
IDNAPI char * stringprep_locale_to_utf8(const char *str)
Definition toutf8.c:145
#define stringprep_nameprep_no_unassigned(in, maxlen)
Definition stringprep.h:205