PostgreSQL Source Code git master
pg_locale_libc.c
Go to the documentation of this file.
1/*-----------------------------------------------------------------------
2 *
3 * PostgreSQL locale utilities for libc
4 *
5 * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 *
7 * src/backend/utils/adt/pg_locale_libc.c
8 *
9 *-----------------------------------------------------------------------
10 */
11
12#include "postgres.h"
13
14#include <limits.h>
15#include <wctype.h>
16
17#include "access/htup_details.h"
18#include "catalog/pg_database.h"
20#include "mb/pg_wchar.h"
21#include "miscadmin.h"
22#include "utils/builtins.h"
23#include "utils/formatting.h"
24#include "utils/memutils.h"
25#include "utils/pg_locale.h"
26#include "utils/syscache.h"
27
28#ifdef __GLIBC__
29#include <gnu/libc-version.h>
30#endif
31
32#ifdef WIN32
33#include <shlwapi.h>
34#endif
35
36/*
37 * For the libc provider, to provide as much functionality as possible on a
38 * variety of platforms without going so far as to implement everything from
39 * scratch, we use several implementation strategies depending on the
40 * situation:
41 *
42 * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
43 * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
44 * collations don't give a fig about multibyte characters.
45 *
46 * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
47 * This assumes that every platform uses Unicode codepoints directly
48 * as the wchar_t representation of Unicode. On some platforms
49 * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
50 *
51 * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
52 * values up to 255, and punt for values above that. This is 100% correct
53 * only in single-byte encodings such as LATINn. However, non-Unicode
54 * multibyte encodings are mostly Far Eastern character sets for which the
55 * properties being tested here aren't very relevant for higher code values
56 * anyway. The difficulty with using the <wctype.h> functions with
57 * non-Unicode multibyte encodings is that we can have no certainty that
58 * the platform's wchar_t representation matches what we do in pg_wchar
59 * conversions.
60 *
61 * As a special case, in the "default" collation, (2) and (3) force ASCII
62 * letters to follow ASCII upcase/downcase rules, while in a non-default
63 * collation we just let the library functions do what they will. The case
64 * where this matters is treatment of I/i in Turkish, and the behavior is
65 * meant to match the upper()/lower() SQL functions.
66 *
67 * We store the active collation setting in static variables. In principle
68 * it could be passed down to here via the regex library's "struct vars" data
69 * structure; but that would require somewhat invasive changes in the regex
70 * library, and right now there's no real benefit to be gained from that.
71 *
72 * NB: the coding here assumes pg_wchar is an unsigned type.
73 */
74
75/*
76 * Size of stack buffer to use for string transformations, used to avoid heap
77 * allocations in typical cases. This should be large enough that most strings
78 * will fit, but small enough that we feel comfortable putting it on the
79 * stack.
80 */
81#define TEXTBUFLEN 1024
82
84
85static int strncoll_libc(const char *arg1, ssize_t len1,
86 const char *arg2, ssize_t len2,
88static size_t strnxfrm_libc(char *dest, size_t destsize,
89 const char *src, ssize_t srclen,
91extern char *get_collation_actual_version_libc(const char *collcollate);
92static locale_t make_libc_collator(const char *collate,
93 const char *ctype);
94
95#ifdef WIN32
96static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
97 const char *arg2, ssize_t len2,
99#endif
100
101static size_t char2wchar(wchar_t *to, size_t tolen, const char *from,
102 size_t fromlen, locale_t loc);
103
104static size_t strlower_libc_sb(char *dest, size_t destsize,
105 const char *src, ssize_t srclen,
107static size_t strlower_libc_mb(char *dest, size_t destsize,
108 const char *src, ssize_t srclen,
110static size_t strtitle_libc_sb(char *dest, size_t destsize,
111 const char *src, ssize_t srclen,
113static size_t strtitle_libc_mb(char *dest, size_t destsize,
114 const char *src, ssize_t srclen,
116static size_t strupper_libc_sb(char *dest, size_t destsize,
117 const char *src, ssize_t srclen,
119static size_t strupper_libc_mb(char *dest, size_t destsize,
120 const char *src, ssize_t srclen,
122
123static bool
125{
126 return isdigit_l((unsigned char) wc, locale->lt);
127}
128
129static bool
131{
132 return isalpha_l((unsigned char) wc, locale->lt);
133}
134
135static bool
137{
138 return isalnum_l((unsigned char) wc, locale->lt);
139}
140
141static bool
143{
144 return isupper_l((unsigned char) wc, locale->lt);
145}
146
147static bool
149{
150 return islower_l((unsigned char) wc, locale->lt);
151}
152
153static bool
155{
156 return isgraph_l((unsigned char) wc, locale->lt);
157}
158
159static bool
161{
162 return isprint_l((unsigned char) wc, locale->lt);
163}
164
165static bool
167{
168 return ispunct_l((unsigned char) wc, locale->lt);
169}
170
171static bool
173{
174 return isspace_l((unsigned char) wc, locale->lt);
175}
176
177static bool
179{
180#ifndef WIN32
181 return isxdigit_l((unsigned char) wc, locale->lt);
182#else
183 return _isxdigit_l((unsigned char) wc, locale->lt);
184#endif
185}
186
187static bool
189{
190 return iswdigit_l((wint_t) wc, locale->lt);
191}
192
193static bool
195{
196 return iswalpha_l((wint_t) wc, locale->lt);
197}
198
199static bool
201{
202 return iswalnum_l((wint_t) wc, locale->lt);
203}
204
205static bool
207{
208 return iswupper_l((wint_t) wc, locale->lt);
209}
210
211static bool
213{
214 return iswlower_l((wint_t) wc, locale->lt);
215}
216
217static bool
219{
220 return iswgraph_l((wint_t) wc, locale->lt);
221}
222
223static bool
225{
226 return iswprint_l((wint_t) wc, locale->lt);
227}
228
229static bool
231{
232 return iswpunct_l((wint_t) wc, locale->lt);
233}
234
235static bool
237{
238 return iswspace_l((wint_t) wc, locale->lt);
239}
240
241static bool
243{
244#ifndef WIN32
245 return iswxdigit_l((wint_t) wc, locale->lt);
246#else
247 return _iswxdigit_l((wint_t) wc, locale->lt);
248#endif
249}
250
251static char
253{
255 return tolower_l(ch, locale->lt);
256}
257
258static bool
260{
261 bool is_multibyte = pg_database_encoding_max_length() > 1;
262
263 if (is_multibyte && IS_HIGHBIT_SET(ch))
264 return true;
265 else
266 return isalpha_l((unsigned char) ch, locale->lt);
267}
268
269static pg_wchar
271{
273
274 /* force C behavior for ASCII characters, per comments above */
275 if (locale->is_default && wc <= (pg_wchar) 127)
276 return pg_ascii_toupper((unsigned char) wc);
277 if (wc <= (pg_wchar) UCHAR_MAX)
278 return toupper_l((unsigned char) wc, locale->lt);
279 else
280 return wc;
281}
282
283static pg_wchar
285{
287
288 /* force C behavior for ASCII characters, per comments above */
289 if (locale->is_default && wc <= (pg_wchar) 127)
290 return pg_ascii_toupper((unsigned char) wc);
291 if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
292 return towupper_l((wint_t) wc, locale->lt);
293 else
294 return wc;
295}
296
297static pg_wchar
299{
301
302 /* force C behavior for ASCII characters, per comments above */
303 if (locale->is_default && wc <= (pg_wchar) 127)
304 return pg_ascii_tolower((unsigned char) wc);
305 if (wc <= (pg_wchar) UCHAR_MAX)
306 return tolower_l((unsigned char) wc, locale->lt);
307 else
308 return wc;
309}
310
311static pg_wchar
313{
315
316 /* force C behavior for ASCII characters, per comments above */
317 if (locale->is_default && wc <= (pg_wchar) 127)
318 return pg_ascii_tolower((unsigned char) wc);
319 if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
320 return towlower_l((wint_t) wc, locale->lt);
321 else
322 return wc;
323}
324
327 .strtitle = strtitle_libc_sb,
328 .strupper = strupper_libc_sb,
329 /* in libc, casefolding is the same as lowercasing */
330 .strfold = strlower_libc_sb,
331 .wc_isdigit = wc_isdigit_libc_sb,
332 .wc_isalpha = wc_isalpha_libc_sb,
333 .wc_isalnum = wc_isalnum_libc_sb,
334 .wc_isupper = wc_isupper_libc_sb,
335 .wc_islower = wc_islower_libc_sb,
336 .wc_isgraph = wc_isgraph_libc_sb,
337 .wc_isprint = wc_isprint_libc_sb,
338 .wc_ispunct = wc_ispunct_libc_sb,
339 .wc_isspace = wc_isspace_libc_sb,
340 .wc_isxdigit = wc_isxdigit_libc_sb,
341 .char_is_cased = char_is_cased_libc,
342 .char_tolower = char_tolower_libc,
343 .wc_toupper = toupper_libc_sb,
344 .wc_tolower = tolower_libc_sb,
345 .max_chr = UCHAR_MAX,
346};
347
348/*
349 * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but
350 * single-byte semantics for pattern matching.
351 */
354 .strtitle = strtitle_libc_mb,
355 .strupper = strupper_libc_mb,
356 /* in libc, casefolding is the same as lowercasing */
357 .strfold = strlower_libc_mb,
358 .wc_isdigit = wc_isdigit_libc_sb,
359 .wc_isalpha = wc_isalpha_libc_sb,
360 .wc_isalnum = wc_isalnum_libc_sb,
361 .wc_isupper = wc_isupper_libc_sb,
362 .wc_islower = wc_islower_libc_sb,
363 .wc_isgraph = wc_isgraph_libc_sb,
364 .wc_isprint = wc_isprint_libc_sb,
365 .wc_ispunct = wc_ispunct_libc_sb,
366 .wc_isspace = wc_isspace_libc_sb,
367 .wc_isxdigit = wc_isxdigit_libc_sb,
368 .char_is_cased = char_is_cased_libc,
369 .char_tolower = char_tolower_libc,
370 .wc_toupper = toupper_libc_sb,
371 .wc_tolower = tolower_libc_sb,
372 .max_chr = UCHAR_MAX,
373};
374
377 .strtitle = strtitle_libc_mb,
378 .strupper = strupper_libc_mb,
379 /* in libc, casefolding is the same as lowercasing */
380 .strfold = strlower_libc_mb,
381 .wc_isdigit = wc_isdigit_libc_mb,
382 .wc_isalpha = wc_isalpha_libc_mb,
383 .wc_isalnum = wc_isalnum_libc_mb,
384 .wc_isupper = wc_isupper_libc_mb,
385 .wc_islower = wc_islower_libc_mb,
386 .wc_isgraph = wc_isgraph_libc_mb,
387 .wc_isprint = wc_isprint_libc_mb,
388 .wc_ispunct = wc_ispunct_libc_mb,
389 .wc_isspace = wc_isspace_libc_mb,
390 .wc_isxdigit = wc_isxdigit_libc_mb,
391 .char_is_cased = char_is_cased_libc,
392 .char_tolower = char_tolower_libc,
393 .wc_toupper = toupper_libc_mb,
394 .wc_tolower = tolower_libc_mb,
395};
396
399 .strnxfrm = strnxfrm_libc,
400 .strnxfrm_prefix = NULL,
401
402 /*
403 * Unfortunately, it seems that strxfrm() for non-C collations is broken
404 * on many common platforms; testing of multiple versions of glibc reveals
405 * that, for many locales, strcoll() and strxfrm() do not return
406 * consistent results. While no other libc other than Cygwin has so far
407 * been shown to have a problem, we take the conservative course of action
408 * for right now and disable this categorically. (Users who are certain
409 * this isn't a problem on their system can define TRUST_STRXFRM.)
410 */
411#ifdef TRUST_STRXFRM
412 .strxfrm_is_safe = true,
413#else
414 .strxfrm_is_safe = false,
415#endif
416};
417
418#ifdef WIN32
419static const struct collate_methods collate_methods_libc_win32_utf8 = {
420 .strncoll = strncoll_libc_win32_utf8,
421 .strnxfrm = strnxfrm_libc,
422 .strnxfrm_prefix = NULL,
423#ifdef TRUST_STRXFRM
424 .strxfrm_is_safe = true,
425#else
426 .strxfrm_is_safe = false,
427#endif
428};
429#endif
430
431static size_t
432strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
434{
435 if (srclen < 0)
436 srclen = strlen(src);
437
438 if (srclen + 1 <= destsize)
439 {
440 locale_t loc = locale->lt;
441 char *p;
442
443 memcpy(dest, src, srclen);
444 dest[srclen] = '\0';
445
446 /*
447 * Note: we assume that tolower_l() will not be so broken as to need
448 * an isupper_l() guard test. When using the default collation, we
449 * apply the traditional Postgres behavior that forces ASCII-style
450 * treatment of I/i, but in non-default collations you get exactly
451 * what the collation says.
452 */
453 for (p = dest; *p; p++)
454 {
455 if (locale->is_default)
456 {
457 if (*p >= 'A' && *p <= 'Z')
458 *p += 'a' - 'A';
459 else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
460 *p = tolower_l((unsigned char) *p, loc);
461 }
462 else
463 *p = tolower_l((unsigned char) *p, loc);
464 }
465 }
466
467 return srclen;
468}
469
470static size_t
471strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
473{
474 locale_t loc = locale->lt;
475 size_t result_size;
476 wchar_t *workspace;
477 char *result;
478 size_t curr_char;
479 size_t max_size;
480
481 if (srclen < 0)
482 srclen = strlen(src);
483
484 /* Overflow paranoia */
485 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
487 (errcode(ERRCODE_OUT_OF_MEMORY),
488 errmsg("out of memory")));
489
490 /* Output workspace cannot have more codes than input bytes */
491 workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
492
493 char2wchar(workspace, srclen + 1, src, srclen, loc);
494
495 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
496 workspace[curr_char] = towlower_l(workspace[curr_char], loc);
497
498 /*
499 * Make result large enough; case change might change number of bytes
500 */
501 max_size = curr_char * pg_database_encoding_max_length();
502 result = palloc(max_size + 1);
503
504 result_size = wchar2char(result, workspace, max_size + 1, loc);
505
506 if (result_size + 1 > destsize)
507 return result_size;
508
509 memcpy(dest, result, result_size);
510 dest[result_size] = '\0';
511
512 pfree(workspace);
513 pfree(result);
514
515 return result_size;
516}
517
518static size_t
519strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
521{
522 if (srclen < 0)
523 srclen = strlen(src);
524
525 if (srclen + 1 <= destsize)
526 {
527 locale_t loc = locale->lt;
528 int wasalnum = false;
529 char *p;
530
531 memcpy(dest, src, srclen);
532 dest[srclen] = '\0';
533
534 /*
535 * Note: we assume that toupper_l()/tolower_l() will not be so broken
536 * as to need guard tests. When using the default collation, we apply
537 * the traditional Postgres behavior that forces ASCII-style treatment
538 * of I/i, but in non-default collations you get exactly what the
539 * collation says.
540 */
541 for (p = dest; *p; p++)
542 {
543 if (locale->is_default)
544 {
545 if (wasalnum)
546 {
547 if (*p >= 'A' && *p <= 'Z')
548 *p += 'a' - 'A';
549 else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
550 *p = tolower_l((unsigned char) *p, loc);
551 }
552 else
553 {
554 if (*p >= 'a' && *p <= 'z')
555 *p -= 'a' - 'A';
556 else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
557 *p = toupper_l((unsigned char) *p, loc);
558 }
559 }
560 else
561 {
562 if (wasalnum)
563 *p = tolower_l((unsigned char) *p, loc);
564 else
565 *p = toupper_l((unsigned char) *p, loc);
566 }
567 wasalnum = isalnum_l((unsigned char) *p, loc);
568 }
569 }
570
571 return srclen;
572}
573
574static size_t
575strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
577{
578 locale_t loc = locale->lt;
579 int wasalnum = false;
580 size_t result_size;
581 wchar_t *workspace;
582 char *result;
583 size_t curr_char;
584 size_t max_size;
585
586 if (srclen < 0)
587 srclen = strlen(src);
588
589 /* Overflow paranoia */
590 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
592 (errcode(ERRCODE_OUT_OF_MEMORY),
593 errmsg("out of memory")));
594
595 /* Output workspace cannot have more codes than input bytes */
596 workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
597
598 char2wchar(workspace, srclen + 1, src, srclen, loc);
599
600 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
601 {
602 if (wasalnum)
603 workspace[curr_char] = towlower_l(workspace[curr_char], loc);
604 else
605 workspace[curr_char] = towupper_l(workspace[curr_char], loc);
606 wasalnum = iswalnum_l(workspace[curr_char], loc);
607 }
608
609 /*
610 * Make result large enough; case change might change number of bytes
611 */
612 max_size = curr_char * pg_database_encoding_max_length();
613 result = palloc(max_size + 1);
614
615 result_size = wchar2char(result, workspace, max_size + 1, loc);
616
617 if (result_size + 1 > destsize)
618 return result_size;
619
620 memcpy(dest, result, result_size);
621 dest[result_size] = '\0';
622
623 pfree(workspace);
624 pfree(result);
625
626 return result_size;
627}
628
629static size_t
630strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
632{
633 if (srclen < 0)
634 srclen = strlen(src);
635
636 if (srclen + 1 <= destsize)
637 {
638 locale_t loc = locale->lt;
639 char *p;
640
641 memcpy(dest, src, srclen);
642 dest[srclen] = '\0';
643
644 /*
645 * Note: we assume that toupper_l() will not be so broken as to need
646 * an islower_l() guard test. When using the default collation, we
647 * apply the traditional Postgres behavior that forces ASCII-style
648 * treatment of I/i, but in non-default collations you get exactly
649 * what the collation says.
650 */
651 for (p = dest; *p; p++)
652 {
653 if (locale->is_default)
654 {
655 if (*p >= 'a' && *p <= 'z')
656 *p -= 'a' - 'A';
657 else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
658 *p = toupper_l((unsigned char) *p, loc);
659 }
660 else
661 *p = toupper_l((unsigned char) *p, loc);
662 }
663 }
664
665 return srclen;
666}
667
668static size_t
669strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
671{
672 locale_t loc = locale->lt;
673 size_t result_size;
674 wchar_t *workspace;
675 char *result;
676 size_t curr_char;
677 size_t max_size;
678
679 if (srclen < 0)
680 srclen = strlen(src);
681
682 /* Overflow paranoia */
683 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
685 (errcode(ERRCODE_OUT_OF_MEMORY),
686 errmsg("out of memory")));
687
688 /* Output workspace cannot have more codes than input bytes */
689 workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
690
691 char2wchar(workspace, srclen + 1, src, srclen, loc);
692
693 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
694 workspace[curr_char] = towupper_l(workspace[curr_char], loc);
695
696 /*
697 * Make result large enough; case change might change number of bytes
698 */
699 max_size = curr_char * pg_database_encoding_max_length();
700 result = palloc(max_size + 1);
701
702 result_size = wchar2char(result, workspace, max_size + 1, loc);
703
704 if (result_size + 1 > destsize)
705 return result_size;
706
707 memcpy(dest, result, result_size);
708 dest[result_size] = '\0';
709
710 pfree(workspace);
711 pfree(result);
712
713 return result_size;
714}
715
718{
719 const char *collate;
720 const char *ctype;
721 locale_t loc;
722 pg_locale_t result;
723
724 if (collid == DEFAULT_COLLATION_OID)
725 {
726 HeapTuple tp;
727 Datum datum;
728
730 if (!HeapTupleIsValid(tp))
731 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
732 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
733 Anum_pg_database_datcollate);
734 collate = TextDatumGetCString(datum);
735 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
736 Anum_pg_database_datctype);
737 ctype = TextDatumGetCString(datum);
738
739 ReleaseSysCache(tp);
740 }
741 else
742 {
743 HeapTuple tp;
744 Datum datum;
745
747 if (!HeapTupleIsValid(tp))
748 elog(ERROR, "cache lookup failed for collation %u", collid);
749
750 datum = SysCacheGetAttrNotNull(COLLOID, tp,
751 Anum_pg_collation_collcollate);
752 collate = TextDatumGetCString(datum);
753 datum = SysCacheGetAttrNotNull(COLLOID, tp,
754 Anum_pg_collation_collctype);
755 ctype = TextDatumGetCString(datum);
756
757 ReleaseSysCache(tp);
758 }
759
760
761 loc = make_libc_collator(collate, ctype);
762
763 result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
764 result->deterministic = true;
765 result->collate_is_c = (strcmp(collate, "C") == 0) ||
766 (strcmp(collate, "POSIX") == 0);
767 result->ctype_is_c = (strcmp(ctype, "C") == 0) ||
768 (strcmp(ctype, "POSIX") == 0);
769 result->lt = loc;
770 if (!result->collate_is_c)
771 {
772#ifdef WIN32
774 result->collate = &collate_methods_libc_win32_utf8;
775 else
776#endif
777 result->collate = &collate_methods_libc;
778 }
779 if (!result->ctype_is_c)
780 {
785 else
786 result->ctype = &ctype_methods_libc_sb;
787 }
788
789 return result;
790}
791
792/*
793 * Create a locale_t with the given collation and ctype.
794 *
795 * The "C" and "POSIX" locales are not actually handled by libc, so return
796 * NULL.
797 *
798 * Ensure that no path leaks a locale_t.
799 */
800static locale_t
801make_libc_collator(const char *collate, const char *ctype)
802{
803 locale_t loc = 0;
804
805 if (strcmp(collate, ctype) == 0)
806 {
807 if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
808 {
809 /* Normal case where they're the same */
810 errno = 0;
811#ifndef WIN32
812 loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
813 NULL);
814#else
815 loc = _create_locale(LC_ALL, collate);
816#endif
817 if (!loc)
819 }
820 }
821 else
822 {
823#ifndef WIN32
824 /* We need two newlocale() steps */
825 locale_t loc1 = 0;
826
827 if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
828 {
829 errno = 0;
830 loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
831 if (!loc1)
833 }
834
835 if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
836 {
837 errno = 0;
838 loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
839 if (!loc)
840 {
841 if (loc1)
842 freelocale(loc1);
844 }
845 }
846 else
847 loc = loc1;
848#else
849
850 /*
851 * XXX The _create_locale() API doesn't appear to support this. Could
852 * perhaps be worked around by changing pg_locale_t to contain two
853 * separate fields.
854 */
856 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
857 errmsg("collations with different collate and ctype values are not supported on this platform")));
858#endif
859 }
860
861 return loc;
862}
863
864/*
865 * strncoll_libc
866 *
867 * NUL-terminate arguments, if necessary, and pass to strcoll_l().
868 *
869 * An input string length of -1 means that it's already NUL-terminated.
870 */
871int
872strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
874{
875 char sbuf[TEXTBUFLEN];
876 char *buf = sbuf;
877 size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
878 size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
879 const char *arg1n;
880 const char *arg2n;
881 int result;
882
883 if (bufsize1 + bufsize2 > TEXTBUFLEN)
884 buf = palloc(bufsize1 + bufsize2);
885
886 /* nul-terminate arguments if necessary */
887 if (len1 == -1)
888 {
889 arg1n = arg1;
890 }
891 else
892 {
893 char *buf1 = buf;
894
895 memcpy(buf1, arg1, len1);
896 buf1[len1] = '\0';
897 arg1n = buf1;
898 }
899
900 if (len2 == -1)
901 {
902 arg2n = arg2;
903 }
904 else
905 {
906 char *buf2 = buf + bufsize1;
907
908 memcpy(buf2, arg2, len2);
909 buf2[len2] = '\0';
910 arg2n = buf2;
911 }
912
913 result = strcoll_l(arg1n, arg2n, locale->lt);
914
915 if (buf != sbuf)
916 pfree(buf);
917
918 return result;
919}
920
921/*
922 * strnxfrm_libc
923 *
924 * NUL-terminate src, if necessary, and pass to strxfrm_l().
925 *
926 * A source length of -1 means that it's already NUL-terminated.
927 */
928size_t
929strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
931{
932 char sbuf[TEXTBUFLEN];
933 char *buf = sbuf;
934 size_t bufsize = srclen + 1;
935 size_t result;
936
937 if (srclen == -1)
938 return strxfrm_l(dest, src, destsize, locale->lt);
939
940 if (bufsize > TEXTBUFLEN)
941 buf = palloc(bufsize);
942
943 /* nul-terminate argument */
944 memcpy(buf, src, srclen);
945 buf[srclen] = '\0';
946
947 result = strxfrm_l(dest, buf, destsize, locale->lt);
948
949 if (buf != sbuf)
950 pfree(buf);
951
952 /* if dest is defined, it should be nul-terminated */
953 Assert(result >= destsize || dest[result] == '\0');
954
955 return result;
956}
957
/*
 * get_collation_actual_version_libc
 *
 * Return a palloc'd string describing the version of the libc collation
 * named by collcollate, or NULL when no version is available.  "C", "C.*"
 * and "POSIX" (matched case-insensitively) never have a version.  On
 * platforms matching none of the branches below, NULL is returned as well.
 */
char *
get_collation_actual_version_libc(const char *collcollate)
{
	char	   *collversion = NULL;

	if (pg_strcasecmp("C", collcollate) != 0 &&
		pg_strncasecmp("C.", collcollate, 2) != 0 &&
		pg_strcasecmp("POSIX", collcollate) != 0)
	{
#if defined(__GLIBC__)
		/* Use the glibc version because we don't have anything better. */
		collversion = pstrdup(gnu_get_libc_version());
#elif defined(LC_VERSION_MASK)
		locale_t	loc;

		/* Look up FreeBSD collation version. */
		loc = newlocale(LC_COLLATE_MASK, collcollate, NULL);
		if (loc)
		{
			collversion =
				pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
			freelocale(loc);
		}
		else
			ereport(ERROR,
					(errmsg("could not load locale \"%s\"", collcollate)));
#elif defined(WIN32)
		/*
		 * If we are targeting Windows Vista and above, we can ask for
		 * version information given a locale name (earlier versions
		 * required a location code that we don't have).
		 */
		NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
		WCHAR		wide_collcollate[LOCALE_NAME_MAX_LENGTH];

		MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
							LOCALE_NAME_MAX_LENGTH);
		if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
		{
			/*
			 * GetNLSVersionEx() wants a language tag such as "en-US", not a
			 * locale name like "English_United States.1252".  Until those
			 * values can be prevented from entering the system, or 100%
			 * reliably converted to the more useful tag format, tolerate the
			 * resulting error and report that we have no version data.
			 */
			if (GetLastError() == ERROR_INVALID_PARAMETER)
				return NULL;

			ereport(ERROR,
					(errmsg("could not get collation version for locale \"%s\": error code %lu",
							collcollate,
							GetLastError())));
		}
		collversion = psprintf("%lu.%lu,%lu.%lu",
							   (version.dwNLSVersion >> 8) & 0xFFFF,
							   version.dwNLSVersion & 0xFF,
							   (version.dwDefinedVersion >> 8) & 0xFFFF,
							   version.dwDefinedVersion & 0xFF);
#endif
	}

	return collversion;
}
1022
1023/*
1024 * strncoll_libc_win32_utf8
1025 *
1026 * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
1027 * invoke wcscoll_l().
1028 *
1029 * An input string length of -1 means that it's NUL-terminated.
1030 */
1031#ifdef WIN32
1032static int
1033strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
1034 ssize_t len2, pg_locale_t locale)
1035{
1036 char sbuf[TEXTBUFLEN];
1037 char *buf = sbuf;
1038 char *a1p,
1039 *a2p;
1040 int a1len;
1041 int a2len;
1042 int r;
1043 int result;
1044
1046
1047 if (len1 == -1)
1048 len1 = strlen(arg1);
1049 if (len2 == -1)
1050 len2 = strlen(arg2);
1051
1052 a1len = len1 * 2 + 2;
1053 a2len = len2 * 2 + 2;
1054
1055 if (a1len + a2len > TEXTBUFLEN)
1056 buf = palloc(a1len + a2len);
1057
1058 a1p = buf;
1059 a2p = buf + a1len;
1060
1061 /* API does not work for zero-length input */
1062 if (len1 == 0)
1063 r = 0;
1064 else
1065 {
1066 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1067 (LPWSTR) a1p, a1len / 2);
1068 if (!r)
1069 ereport(ERROR,
1070 (errmsg("could not convert string to UTF-16: error code %lu",
1071 GetLastError())));
1072 }
1073 ((LPWSTR) a1p)[r] = 0;
1074
1075 if (len2 == 0)
1076 r = 0;
1077 else
1078 {
1079 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1080 (LPWSTR) a2p, a2len / 2);
1081 if (!r)
1082 ereport(ERROR,
1083 (errmsg("could not convert string to UTF-16: error code %lu",
1084 GetLastError())));
1085 }
1086 ((LPWSTR) a2p)[r] = 0;
1087
1088 errno = 0;
1089 result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->lt);
1090 if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */
1091 ereport(ERROR,
1092 (errmsg("could not compare Unicode strings: %m")));
1093
1094 if (buf != sbuf)
1095 pfree(buf);
1096
1097 return result;
1098}
1099#endif /* WIN32 */
1100
1101/* simple subroutine for reporting errors from newlocale() */
1102void
1103report_newlocale_failure(const char *localename)
1104{
1105 int save_errno;
1106
1107 /*
1108 * Windows doesn't provide any useful error indication from
1109 * _create_locale(), and BSD-derived platforms don't seem to feel they
1110 * need to set errno either (even though POSIX is pretty clear that
1111 * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1112 * is what to report.
1113 */
1114 if (errno == 0)
1115 errno = ENOENT;
1116
1117 /*
1118 * ENOENT means "no such locale", not "no such file", so clarify that
1119 * errno with an errdetail message.
1120 */
1121 save_errno = errno; /* auxiliary funcs might change errno */
1122 ereport(ERROR,
1123 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1124 errmsg("could not create locale \"%s\": %m",
1125 localename),
1126 (save_errno == ENOENT ?
1127 errdetail("The operating system could not find any locale data for the locale name \"%s\".",
1128 localename) : 0)));
1129}
1130
1131/*
1132 * POSIX doesn't define _l-variants of these functions, but several systems
1133 * have them. We provide our own replacements here.
1134 */
1135#ifndef HAVE_MBSTOWCS_L
1136static size_t
1137mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
1138{
1139#ifdef WIN32
1140 return _mbstowcs_l(dest, src, n, loc);
1141#else
1142 size_t result;
1143 locale_t save_locale = uselocale(loc);
1144
1145 result = mbstowcs(dest, src, n);
1146 uselocale(save_locale);
1147 return result;
1148#endif
1149}
1150#endif
1151#ifndef HAVE_WCSTOMBS_L
1152static size_t
1153wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
1154{
1155#ifdef WIN32
1156 return _wcstombs_l(dest, src, n, loc);
1157#else
1158 size_t result;
1159 locale_t save_locale = uselocale(loc);
1160
1161 result = wcstombs(dest, src, n);
1162 uselocale(save_locale);
1163 return result;
1164#endif
1165}
1166#endif
1167
1168/*
1169 * These functions convert from/to libc's wchar_t, *not* pg_wchar.
1170 * Therefore we keep them here rather than with the mbutils code.
1171 */
1172
1173/*
1174 * wchar2char --- convert wide characters to multibyte format
1175 *
1176 * This has the same API as the standard wcstombs_l() function; in particular,
1177 * tolen is the maximum number of bytes to store at *to, and *from must be
1178 * zero-terminated. The output will be zero-terminated iff there is room.
1179 */
1180size_t
1181wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
1182{
1183 size_t result;
1184
1185 if (tolen == 0)
1186 return 0;
1187
1188#ifdef WIN32
1189
1190 /*
1191 * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
1192 * for some reason mbstowcs and wcstombs won't do this for us, so we use
1193 * MultiByteToWideChar().
1194 */
1196 {
1197 result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
1198 NULL, NULL);
1199 /* A zero return is failure */
1200 if (result <= 0)
1201 result = -1;
1202 else
1203 {
1204 Assert(result <= tolen);
1205 /* Microsoft counts the zero terminator in the result */
1206 result--;
1207 }
1208 }
1209 else
1210#endif /* WIN32 */
1211 if (loc == (locale_t) 0)
1212 {
1213 /* Use wcstombs directly for the default locale */
1214 result = wcstombs(to, from, tolen);
1215 }
1216 else
1217 {
1218 /* Use wcstombs_l for nondefault locales */
1219 result = wcstombs_l(to, from, tolen, loc);
1220 }
1221
1222 return result;
1223}
1224
1225/*
1226 * char2wchar --- convert multibyte characters to wide characters
1227 *
1228 * This has almost the API of mbstowcs_l(), except that *from need not be
1229 * null-terminated; instead, the number of input bytes is specified as
1230 * fromlen. Also, we ereport() rather than returning -1 for invalid
1231 * input encoding. tolen is the maximum number of wchar_t's to store at *to.
1232 * The output will be zero-terminated iff there is room.
1233 */
1234static size_t
1235char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
1236 locale_t loc)
1237{
1238 size_t result;
1239
1240 if (tolen == 0)
1241 return 0;
1242
1243#ifdef WIN32
1244 /* See WIN32 "Unicode" comment above */
1246 {
1247 /* Win32 API does not work for zero-length input */
1248 if (fromlen == 0)
1249 result = 0;
1250 else
1251 {
1252 result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
1253 /* A zero return is failure */
1254 if (result == 0)
1255 result = -1;
1256 }
1257
1258 if (result != -1)
1259 {
1260 Assert(result < tolen);
1261 /* Append trailing null wchar (MultiByteToWideChar() does not) */
1262 to[result] = 0;
1263 }
1264 }
1265 else
1266#endif /* WIN32 */
1267 {
1268 /* mbstowcs requires ending '\0' */
1269 char *str = pnstrdup(from, fromlen);
1270
1271 if (loc == (locale_t) 0)
1272 {
1273 /* Use mbstowcs directly for the default locale */
1274 result = mbstowcs(to, str, tolen);
1275 }
1276 else
1277 {
1278 /* Use mbstowcs_l for nondefault locales */
1279 result = mbstowcs_l(to, str, tolen, loc);
1280 }
1281
1282 pfree(str);
1283 }
1284
1285 if (result == -1)
1286 {
1287 /*
1288 * Invalid multibyte character encountered. We try to give a useful
1289 * error message by letting pg_verifymbstr check the string. But it's
1290 * possible that the string is OK to us, and not OK to mbstowcs ---
1291 * this suggests that the LC_CTYPE locale is different from the
1292 * database encoding. Give a generic error message if pg_verifymbstr
1293 * can't find anything wrong.
1294 */
1295 pg_verifymbstr(from, fromlen, false); /* might not return */
1296 /* but if it does ... */
1297 ereport(ERROR,
1298 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1299 errmsg("invalid multibyte character for locale"),
1300 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
1301 }
1302
1303 return result;
1304}
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1145
Oid collid
int errdetail(const char *fmt,...)
Definition: elog.c:1216
int errhint(const char *fmt,...)
Definition: elog.c:1330
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
Oid MyDatabaseId
Definition: globals.c:94
Assert(PointerIsAligned(start, uint64))
const char * str
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define bufsize
Definition: indent_globs.h:36
static char * locale
Definition: initdb.c:140
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
unsigned int pg_wchar
Definition: mbprint.c:31
int GetDatabaseEncoding(void)
Definition: mbutils.c:1262
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition: mbutils.c:1557
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1547
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1263
char * pstrdup(const char *in)
Definition: mcxt.c:1759
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
char * pnstrdup(const char *in, Size len)
Definition: mcxt.c:1770
static bool wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale)
static const struct ctype_methods ctype_methods_libc_other_mb
static const struct ctype_methods ctype_methods_libc_utf8
static pg_wchar toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context)
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
static bool wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
static pg_wchar toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, locale_t loc)
static bool wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale)
char * get_collation_actual_version_libc(const char *collcollate)
static bool wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale)
static locale_t make_libc_collator(const char *collate, const char *ctype)
static bool wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale)
static pg_wchar tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
static size_t wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
static const struct collate_methods collate_methods_libc
static bool wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
static size_t strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
static const struct ctype_methods ctype_methods_libc_sb
static size_t strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
void report_newlocale_failure(const char *localename)
static pg_wchar tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
static char char_tolower_libc(unsigned char ch, pg_locale_t locale)
static bool char_is_cased_libc(char ch, pg_locale_t locale)
static bool wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
#define TEXTBUFLEN
static size_t strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
static size_t strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static char * buf
Definition: pg_test_fsync.c:72
@ PG_UTF8
Definition: pg_wchar.h:232
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:32
static unsigned char pg_ascii_tolower(unsigned char ch)
Definition: port.h:188
static unsigned char pg_ascii_toupper(unsigned char ch)
Definition: port.h:177
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
Definition: pgstrcasecmp.c:65
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:262
uint64_t Datum
Definition: postgres.h:70
unsigned int Oid
Definition: postgres_ext.h:32
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
int(* strncoll)(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
Definition: pg_locale.h:75
size_t(* strlower)(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
Definition: pg_locale.h:101
const struct ctype_methods * ctype
Definition: pg_locale.h:167
const struct collate_methods * collate
Definition: pg_locale.h:166
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:264
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:220
Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition: syscache.c:625
#define locale_t
Definition: win32_port.h:432
#define toupper_l
Definition: win32_port.h:434
#define iswalnum_l
Definition: win32_port.h:442
#define isgraph_l
Definition: win32_port.h:447
#define towupper_l
Definition: win32_port.h:436
#define ispunct_l
Definition: win32_port.h:451
#define isalpha_l
Definition: win32_port.h:439
#define strcoll_l
Definition: win32_port.h:455
#define iswgraph_l
Definition: win32_port.h:448
#define strxfrm_l
Definition: win32_port.h:456
#define towlower_l
Definition: win32_port.h:435
#define iswspace_l
Definition: win32_port.h:454
#define isdigit_l
Definition: win32_port.h:437
#define wcscoll_l
Definition: win32_port.h:457
#define tolower_l
Definition: win32_port.h:433
#define iswupper_l
Definition: win32_port.h:444
#define iswalpha_l
Definition: win32_port.h:440
#define isprint_l
Definition: win32_port.h:449
#define iswprint_l
Definition: win32_port.h:450
#define isupper_l
Definition: win32_port.h:443
#define isalnum_l
Definition: win32_port.h:441
#define islower_l
Definition: win32_port.h:445
#define iswlower_l
Definition: win32_port.h:446
#define iswpunct_l
Definition: win32_port.h:452
#define isspace_l
Definition: win32_port.h:453
#define iswdigit_l
Definition: win32_port.h:438