Source code
Revision control
Copy as Markdown
Other Tools
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 6b3b2dbe7d..cc7030d1dd 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -499,6 +499,7 @@
return bpe_offsets;
}
+#if 0
// use std::wregex to split the text
static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
std::wregex expr(regex_expr);
@@ -528,6 +529,7 @@
return bpe_offsets;
}
+#endif
// use std::regex to split the text
static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
@@ -818,20 +820,22 @@
//printf("regex_expr_collapsed: %s\n", regex_expr_collapsed.c_str());
bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
} else {
- // no unicode category used, we can use std::wregex directly
- const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
+ fprintf(stderr, "Only use utf-8");
+ std::abort();
+ // // no unicode category used, we can use std::wregex directly
+ // const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
- // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback
- std::wstring wtext(cpts.begin(), cpts.end());
- for (size_t i = 0; i < wtext.size(); ++i) {
- if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) {
- wtext[i] = 0x0B;
- }
- }
+ // // std::wregex \s does not match non-ASCII whitespaces, using 0x0B as fallback
+ // std::wstring wtext(cpts.begin(), cpts.end());
+ // for (size_t i = 0; i < wtext.size(); ++i) {
+ // if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) {
+ // wtext[i] = 0x0B;
+ // }
+ // }
- //printf("text: %s\n", text.c_str());
- //printf("regex_expr: %s\n", regex_expr.c_str());
- bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
+ // //printf("text: %s\n", text.c_str());
+ // //printf("regex_expr: %s\n", regex_expr.c_str());
+ // bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
}
} catch (std::regex_error & e) {
fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str());