Skip to content

Commit e163079

Browse files
committed
Correction of Unicode block range bug
1 parent 9357fb5 commit e163079

2 files changed

Lines changed: 114 additions & 103 deletions

File tree

src/ly_common.c

Lines changed: 109 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -724,93 +724,97 @@ ly_pat_compile_posix(const char *pattern, void **pat_comp, struct ly_err_item **
724724
static LY_ERR
725725
ly_pat_compile_xmlschema_chblocks_xmlschema2perl(const char *pattern, char **regex, struct ly_err_item **err)
726726
{
727-
#define URANGE_LEN 19
728-
char *ublock2urange[][2] = {
729-
{"BasicLatin", "[\\x{0000}-\\x{007F}]"},
730-
{"Latin-1Supplement", "[\\x{0080}-\\x{00FF}]"},
731-
{"LatinExtended-A", "[\\x{0100}-\\x{017F}]"},
732-
{"LatinExtended-B", "[\\x{0180}-\\x{024F}]"},
733-
{"IPAExtensions", "[\\x{0250}-\\x{02AF}]"},
734-
{"SpacingModifierLetters", "[\\x{02B0}-\\x{02FF}]"},
735-
{"CombiningDiacriticalMarks", "[\\x{0300}-\\x{036F}]"},
736-
{"Greek", "[\\x{0370}-\\x{03FF}]"},
737-
{"Cyrillic", "[\\x{0400}-\\x{04FF}]"},
738-
{"Armenian", "[\\x{0530}-\\x{058F}]"},
739-
{"Hebrew", "[\\x{0590}-\\x{05FF}]"},
740-
{"Arabic", "[\\x{0600}-\\x{06FF}]"},
741-
{"Syriac", "[\\x{0700}-\\x{074F}]"},
742-
{"Thaana", "[\\x{0780}-\\x{07BF}]"},
743-
{"Devanagari", "[\\x{0900}-\\x{097F}]"},
744-
{"Bengali", "[\\x{0980}-\\x{09FF}]"},
745-
{"Gurmukhi", "[\\x{0A00}-\\x{0A7F}]"},
746-
{"Gujarati", "[\\x{0A80}-\\x{0AFF}]"},
747-
{"Oriya", "[\\x{0B00}-\\x{0B7F}]"},
748-
{"Tamil", "[\\x{0B80}-\\x{0BFF}]"},
749-
{"Telugu", "[\\x{0C00}-\\x{0C7F}]"},
750-
{"Kannada", "[\\x{0C80}-\\x{0CFF}]"},
751-
{"Malayalam", "[\\x{0D00}-\\x{0D7F}]"},
752-
{"Sinhala", "[\\x{0D80}-\\x{0DFF}]"},
753-
{"Thai", "[\\x{0E00}-\\x{0E7F}]"},
754-
{"Lao", "[\\x{0E80}-\\x{0EFF}]"},
755-
{"Tibetan", "[\\x{0F00}-\\x{0FFF}]"},
756-
{"Myanmar", "[\\x{1000}-\\x{109F}]"},
757-
{"Georgian", "[\\x{10A0}-\\x{10FF}]"},
758-
{"HangulJamo", "[\\x{1100}-\\x{11FF}]"},
759-
{"Ethiopic", "[\\x{1200}-\\x{137F}]"},
760-
{"Cherokee", "[\\x{13A0}-\\x{13FF}]"},
761-
{"UnifiedCanadianAboriginalSyllabics", "[\\x{1400}-\\x{167F}]"},
762-
{"Ogham", "[\\x{1680}-\\x{169F}]"},
763-
{"Runic", "[\\x{16A0}-\\x{16FF}]"},
764-
{"Khmer", "[\\x{1780}-\\x{17FF}]"},
765-
{"Mongolian", "[\\x{1800}-\\x{18AF}]"},
766-
{"LatinExtendedAdditional", "[\\x{1E00}-\\x{1EFF}]"},
767-
{"GreekExtended", "[\\x{1F00}-\\x{1FFF}]"},
768-
{"GeneralPunctuation", "[\\x{2000}-\\x{206F}]"},
769-
{"SuperscriptsandSubscripts", "[\\x{2070}-\\x{209F}]"},
770-
{"CurrencySymbols", "[\\x{20A0}-\\x{20CF}]"},
771-
{"CombiningMarksforSymbols", "[\\x{20D0}-\\x{20FF}]"},
772-
{"LetterlikeSymbols", "[\\x{2100}-\\x{214F}]"},
773-
{"NumberForms", "[\\x{2150}-\\x{218F}]"},
774-
{"Arrows", "[\\x{2190}-\\x{21FF}]"},
775-
{"MathematicalOperators", "[\\x{2200}-\\x{22FF}]"},
776-
{"MiscellaneousTechnical", "[\\x{2300}-\\x{23FF}]"},
777-
{"ControlPictures", "[\\x{2400}-\\x{243F}]"},
778-
{"OpticalCharacterRecognition", "[\\x{2440}-\\x{245F}]"},
779-
{"EnclosedAlphanumerics", "[\\x{2460}-\\x{24FF}]"},
780-
{"BoxDrawing", "[\\x{2500}-\\x{257F}]"},
781-
{"BlockElements", "[\\x{2580}-\\x{259F}]"},
782-
{"GeometricShapes", "[\\x{25A0}-\\x{25FF}]"},
783-
{"MiscellaneousSymbols", "[\\x{2600}-\\x{26FF}]"},
784-
{"Dingbats", "[\\x{2700}-\\x{27BF}]"},
785-
{"BraillePatterns", "[\\x{2800}-\\x{28FF}]"},
786-
{"CJKRadicalsSupplement", "[\\x{2E80}-\\x{2EFF}]"},
787-
{"KangxiRadicals", "[\\x{2F00}-\\x{2FDF}]"},
788-
{"IdeographicDescriptionCharacters", "[\\x{2FF0}-\\x{2FFF}]"},
789-
{"CJKSymbolsandPunctuation", "[\\x{3000}-\\x{303F}]"},
790-
{"Hiragana", "[\\x{3040}-\\x{309F}]"},
791-
{"Katakana", "[\\x{30A0}-\\x{30FF}]"},
792-
{"Bopomofo", "[\\x{3100}-\\x{312F}]"},
793-
{"HangulCompatibilityJamo", "[\\x{3130}-\\x{318F}]"},
794-
{"Kanbun", "[\\x{3190}-\\x{319F}]"},
795-
{"BopomofoExtended", "[\\x{31A0}-\\x{31BF}]"},
796-
{"EnclosedCJKLettersandMonths", "[\\x{3200}-\\x{32FF}]"},
797-
{"CJKCompatibility", "[\\x{3300}-\\x{33FF}]"},
798-
{"CJKUnifiedIdeographsExtensionA", "[\\x{3400}-\\x{4DB5}]"},
799-
{"CJKUnifiedIdeographs", "[\\x{4E00}-\\x{9FFF}]"},
800-
{"YiSyllables", "[\\x{A000}-\\x{A48F}]"},
801-
{"YiRadicals", "[\\x{A490}-\\x{A4CF}]"},
802-
{"HangulSyllables", "[\\x{AC00}-\\x{D7A3}]"},
803-
{"PrivateUse", "[\\x{E000}-\\x{F8FF}]"},
804-
{"CJKCompatibilityIdeographs", "[\\x{F900}-\\x{FAFF}]"},
805-
{"AlphabeticPresentationForms", "[\\x{FB00}-\\x{FB4F}]"},
806-
{"ArabicPresentationForms-A", "[\\x{FB50}-\\x{FDFF}]"},
807-
{"CombiningHalfMarks", "[\\x{FE20}-\\x{FE2F}]"},
808-
{"CJKCompatibilityForms", "[\\x{FE30}-\\x{FE4F}]"},
809-
{"SmallFormVariants", "[\\x{FE50}-\\x{FE6F}]"},
810-
{"ArabicPresentationForms-B", "[\\x{FE70}-\\x{FEFE}]"},
811-
{"HalfwidthandFullwidthForms", "[\\x{FF00}-\\x{FFEF}]"},
812-
{"Specials", "[\\x{FEFF}|\\x{FFF0}-\\x{FFFD}]"},
813-
{NULL, NULL}
727+
struct ublock_s {
728+
char *ublock;
729+
char *urange;
730+
size_t size;
731+
};
732+
struct ublock_s ublock2urange[] = {
733+
{"BasicLatin", "[\\x{0000}-\\x{007F}]",19},
734+
{"Latin-1Supplement", "[\\x{0080}-\\x{00FF}]", 19},
735+
{"LatinExtended-A", "[\\x{0100}-\\x{017F}]", 19},
736+
{"LatinExtended-B", "[\\x{0180}-\\x{024F}]", 19},
737+
{"IPAExtensions", "[\\x{0250}-\\x{02AF}]", 19},
738+
{"SpacingModifierLetters", "[\\x{02B0}-\\x{02FF}]", 19},
739+
{"CombiningDiacriticalMarks", "[\\x{0300}-\\x{036F}]", 19},
740+
{"Greek", "[\\x{0370}-\\x{03FF}]", 19},
741+
{"Cyrillic", "[\\x{0400}-\\x{04FF}]", 19},
742+
{"Armenian", "[\\x{0530}-\\x{058F}]", 19},
743+
{"Hebrew", "[\\x{0590}-\\x{05FF}]", 19},
744+
{"Arabic", "[\\x{0600}-\\x{06FF}]", 19},
745+
{"Syriac", "[\\x{0700}-\\x{074F}]", 19},
746+
{"Thaana", "[\\x{0780}-\\x{07BF}]", 19},
747+
{"Devanagari", "[\\x{0900}-\\x{097F}]", 19},
748+
{"Bengali", "[\\x{0980}-\\x{09FF}]", 19},
749+
{"Gurmukhi", "[\\x{0A00}-\\x{0A7F}]", 19},
750+
{"Gujarati", "[\\x{0A80}-\\x{0AFF}]", 19},
751+
{"Oriya", "[\\x{0B00}-\\x{0B7F}]", 19},
752+
{"Tamil", "[\\x{0B80}-\\x{0BFF}]", 19},
753+
{"Telugu", "[\\x{0C00}-\\x{0C7F}]", 19},
754+
{"Kannada", "[\\x{0C80}-\\x{0CFF}]", 19},
755+
{"Malayalam", "[\\x{0D00}-\\x{0D7F}]", 19},
756+
{"Sinhala", "[\\x{0D80}-\\x{0DFF}]", 19},
757+
{"Thai", "[\\x{0E00}-\\x{0E7F}]", 19},
758+
{"Lao", "[\\x{0E80}-\\x{0EFF}]", 19},
759+
{"Tibetan", "[\\x{0F00}-\\x{0FFF}]", 19},
760+
{"Myanmar", "[\\x{1000}-\\x{109F}]", 19},
761+
{"Georgian", "[\\x{10A0}-\\x{10FF}]", 19},
762+
{"HangulJamo", "[\\x{1100}-\\x{11FF}]", 19},
763+
{"Ethiopic", "[\\x{1200}-\\x{137F}]", 19},
764+
{"Cherokee", "[\\x{13A0}-\\x{13FF}]", 19},
765+
{"UnifiedCanadianAboriginalSyllabics", "[\\x{1400}-\\x{167F}]", 19},
766+
{"Ogham", "[\\x{1680}-\\x{169F}]", 19},
767+
{"Runic", "[\\x{16A0}-\\x{16FF}]", 19},
768+
{"Khmer", "[\\x{1780}-\\x{17FF}]", 19},
769+
{"Mongolian", "[\\x{1800}-\\x{18AF}]", 19},
770+
{"LatinExtendedAdditional", "[\\x{1E00}-\\x{1EFF}]", 19},
771+
{"GreekExtended", "[\\x{1F00}-\\x{1FFF}]", 19},
772+
{"GeneralPunctuation", "[\\x{2000}-\\x{206F}]", 19},
773+
{"SuperscriptsandSubscripts", "[\\x{2070}-\\x{209F}]", 19},
774+
{"CurrencySymbols", "[\\x{20A0}-\\x{20CF}]", 19},
775+
{"CombiningMarksforSymbols", "[\\x{20D0}-\\x{20FF}]", 19},
776+
{"LetterlikeSymbols", "[\\x{2100}-\\x{214F}]", 19},
777+
{"NumberForms", "[\\x{2150}-\\x{218F}]", 19},
778+
{"Arrows", "[\\x{2190}-\\x{21FF}]", 19},
779+
{"MathematicalOperators", "[\\x{2200}-\\x{22FF}]", 19},
780+
{"MiscellaneousTechnical", "[\\x{2300}-\\x{23FF}]", 19},
781+
{"ControlPictures", "[\\x{2400}-\\x{243F}]", 19},
782+
{"OpticalCharacterRecognition", "[\\x{2440}-\\x{245F}]", 19},
783+
{"EnclosedAlphanumerics", "[\\x{2460}-\\x{24FF}]", 19},
784+
{"BoxDrawing", "[\\x{2500}-\\x{257F}]", 19},
785+
{"BlockElements", "[\\x{2580}-\\x{259F}]", 19},
786+
{"GeometricShapes", "[\\x{25A0}-\\x{25FF}]", 19},
787+
{"MiscellaneousSymbols", "[\\x{2600}-\\x{26FF}]", 19},
788+
{"Dingbats", "[\\x{2700}-\\x{27BF}]", 19},
789+
{"BraillePatterns", "[\\x{2800}-\\x{28FF}]", 19},
790+
{"CJKRadicalsSupplement", "[\\x{2E80}-\\x{2EFF}]", 19},
791+
{"KangxiRadicals", "[\\x{2F00}-\\x{2FDF}]", 19},
792+
{"IdeographicDescriptionCharacters", "[\\x{2FF0}-\\x{2FFF}]", 19},
793+
{"CJKSymbolsandPunctuation", "[\\x{3000}-\\x{303F}]", 19},
794+
{"Hiragana", "[\\x{3040}-\\x{309F}]", 19},
795+
{"Katakana", "[\\x{30A0}-\\x{30FF}]", 19},
796+
{"Bopomofo", "[\\x{3100}-\\x{312F}]", 19},
797+
{"HangulCompatibilityJamo", "[\\x{3130}-\\x{318F}]", 19},
798+
{"Kanbun", "[\\x{3190}-\\x{319F}]", 19},
799+
{"BopomofoExtended", "[\\x{31A0}-\\x{31BF}]", 19},
800+
{"EnclosedCJKLettersandMonths", "[\\x{3200}-\\x{32FF}]", 19},
801+
{"CJKCompatibility", "[\\x{3300}-\\x{33FF}]", 19},
802+
{"CJKUnifiedIdeographsExtensionA", "[\\x{3400}-\\x{4DB5}]", 19},
803+
{"CJKUnifiedIdeographs", "[\\x{4E00}-\\x{9FFF}]", 19},
804+
{"YiSyllables", "[\\x{A000}-\\x{A48F}]", 19},
805+
{"YiRadicals", "[\\x{A490}-\\x{A4CF}]", 19},
806+
{"HangulSyllables", "[\\x{AC00}-\\x{D7A3}]", 19},
807+
{"PrivateUse", "[\\x{E000}-\\x{F8FF}]", 19},
808+
{"CJKCompatibilityIdeographs", "[\\x{F900}-\\x{FAFF}]", 19},
809+
{"AlphabeticPresentationForms", "[\\x{FB00}-\\x{FB4F}]", 19},
810+
{"ArabicPresentationForms-A", "[\\x{FB50}-\\x{FDFF}]", 19},
811+
{"CombiningHalfMarks", "[\\x{FE20}-\\x{FE2F}]", 19},
812+
{"CJKCompatibilityForms", "[\\x{FE30}-\\x{FE4F}]", 19},
813+
{"SmallFormVariants", "[\\x{FE50}-\\x{FE6F}]", 19},
814+
{"ArabicPresentationForms-B", "[\\x{FE70}-\\x{FEFE}]", 19},
815+
{"HalfwidthandFullwidthForms", "[\\x{FF00}-\\x{FFEF}]", 19},
816+
{"Specials", "[\\x{FEFF}|\\x{FFF0}-\\x{FFFD}]", 28},
817+
{NULL, NULL, 0}
814818
};
815819

816820
size_t idx, idx2, start, end, ublock;
@@ -829,28 +833,31 @@ ly_pat_compile_xmlschema_chblocks_xmlschema2perl(const char *pattern, char **reg
829833
}
830834
end = (ptr - perl_regex) + 1;
831835

832-
/* need more space */
833-
if (end - start < URANGE_LEN) {
834-
perl_regex = ly_realloc(perl_regex, strlen(perl_regex) + (URANGE_LEN - (end - start)) + 1);
835-
*regex = perl_regex;
836-
if (!perl_regex) {
837-
return ly_err_new(err, LY_EMEM, 0, NULL, NULL, LY_EMEM_MSG);
838-
}
839-
}
840836

841837
/* find our range */
842-
for (idx = 0; ublock2urange[idx][0]; ++idx) {
838+
for (idx = 0; ublock2urange[idx].ublock; ++idx) {
843839
if (!strncmp(perl_regex + start + ly_strlen_const("\\p{Is"),
844-
ublock2urange[idx][0], strlen(ublock2urange[idx][0]))) {
840+
ublock2urange[idx].ublock, strlen(ublock2urange[idx].ublock))) {
845841
break;
846842
}
847843
}
848-
if (!ublock2urange[idx][0]) {
844+
if (!ublock2urange[idx].ublock) {
849845
return ly_err_new(err, LY_EVALID, 0, NULL, NULL, "Regular expression \"%s\" is not valid (\"%s\": %s).",
850846
pattern, perl_regex + start + 5, "unknown block name");
851847
}
852848
ublock = idx;
853849

850+
/* need more space */
851+
size_t urange_len = ublock2urange[ublock].size;
852+
if(end - start < urange_len ) {
853+
perl_regex = ly_realloc(perl_regex, strlen(perl_regex) + (urange_len - (end - start)) + 1);
854+
*regex = perl_regex;
855+
if (!perl_regex) {
856+
return ly_err_new(err, LY_EMEM, 0, NULL, NULL, LY_EMEM_MSG);
857+
}
858+
}
859+
860+
854861
/* make the space in the string and replace the block (but we cannot include brackets if it was already enclosed in them) */
855862
for (idx2 = 0, idx = 0; idx2 < start; ++idx2) {
856863
if ((perl_regex[idx2] == '[') && (!idx2 || (perl_regex[idx2 - 1] != '\\'))) {
@@ -863,11 +870,11 @@ ly_pat_compile_xmlschema_chblocks_xmlschema2perl(const char *pattern, char **reg
863870
}
864871
if (idx) {
865872
/* skip brackets */
866-
memmove(perl_regex + start + (URANGE_LEN - 2), perl_regex + end, strlen(perl_regex + end) + 1);
867-
memcpy(perl_regex + start, ublock2urange[ublock][1] + 1, URANGE_LEN - 2);
873+
memmove(perl_regex + start + (urange_len - 2), perl_regex + end, strlen(perl_regex + end) + 1);
874+
memcpy(perl_regex + start, ublock2urange[ublock].urange + 1, urange_len - 2);
868875
} else {
869-
memmove(perl_regex + start + URANGE_LEN, perl_regex + end, strlen(perl_regex + end) + 1);
870-
memcpy(perl_regex + start, ublock2urange[ublock][1], URANGE_LEN);
876+
memmove(perl_regex + start + urange_len, perl_regex + end, strlen(perl_regex + end) + 1);
877+
memcpy(perl_regex + start, ublock2urange[ublock].urange, urange_len);
871878
}
872879
}
873880

@@ -1112,7 +1119,6 @@ ly_pat_match_xmlschema(const void *pat_comp, const char *pattern, const char *st
11121119
int r, match_opts = 0;
11131120
pcre2_code *pcode = (void *)pat_comp;
11141121
pcre2_match_data *match_data = NULL;
1115-
11161122
if (!pat_comp) {
11171123
/* compile pattern first */
11181124
rc = ly_pat_compile_xmlschema(pattern, (void **)&pcode, err);

tests/utests/types/string.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -863,6 +863,11 @@ test_data_xml(void **state)
863863
UTEST_INVALID_MODULE(schema, LYS_IN_YANG, NULL, LY_EVALID);
864864
CHECK_LOG_CTX("Regular expression \"[\\p{IsBasicLatin}\\p{IsUnknownUnicodeBlock}]+\" "
865865
"is not valid (\"UnknownUnicodeBlock}]+\": unknown block name).", "/T_UB_8:port", 0);
866+
867+
schema = MODULE_CREATE_YANG("T_UB_9", "leaf port {type string { pattern "
868+
"'[\\p{IsSpecials}]+';}}");
869+
UTEST_ADD_MODULE(schema, LYS_IN_YANG, NULL, NULL);
870+
TEST_SUCCESS_XML("T_UB_9", "&#xFFFA;&#xFFFD;", STRING, "\xef\xbf\xba\xef\xbf\xbd");
866871
}
867872

868873
static void

0 commit comments

Comments
 (0)