Source code
Revision control
Copy as Markdown
Other Tools
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// option. This file may not be copied, modified, or distributed
// except according to those terms.
#ifndef encoding_rs_mem_h_
#define encoding_rs_mem_h_
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
/*
* _Note:_ "Latin1" in this header refers to the Unicode range from U+0000 to
* U+00FF, inclusive, and does not refer to the windows-1252 range. This
* in-memory encoding is sometimes used as a storage optimization of text
* when UTF-16 indexing and length semantics are exposed.
*/
/**
* Classification of text as Latin1 (all code points are below U+0100),
* left-to-right with some non-Latin1 characters or as containing at least
* some right-to-left characters.
*/
typedef enum {
/**
* Every character is below U+0100.
*/
Latin1 = 0,
/**
* There is at least one character that's U+0100 or higher, but there
* are no right-to-left characters.
*/
LeftToRight = 1,
/**
* There is at least one right-to-left character.
*/
Bidi = 2,
} Latin1Bidi;
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
/**
* Checks whether a valid UTF-8 buffer contains code points
* that trigger right-to-left processing or is all-Latin1.
*
* Possibly more efficient than performing the checks separately.
*
* Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
* Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
* `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
*
* # Undefined behavior
*
* UB ensues if `buffer` and `buffer_len` don't designate a valid memory block,
* if `buffer` is `NULL`, or if the memory designated by `buffer` and
* `buffer_len` does not contain valid UTF-8. (If `buffer_len` is `0`, `buffer`
* may be bogus but still has to be non-`NULL`.)
*/
Latin1Bidi encoding_mem_check_str_for_latin1_and_bidi(const char* buffer,
size_t len);
/**
* Checks whether a potentially invalid UTF-16 buffer contains code points
* that trigger right-to-left processing or is all-Latin1.
*
* Possibly more efficient than performing the checks separately.
*
* Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
* Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
* `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
*
* # Undefined behavior
*
* UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
* or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
* still has to be non-`NULL` and aligned.)
*/
Latin1Bidi encoding_mem_check_utf16_for_latin1_and_bidi(const char16_t* buffer,
size_t len);
/**
* Checks whether a potentially invalid UTF-8 buffer contains code points
* that trigger right-to-left processing or is all-Latin1.
*
* Possibly more efficient than performing the checks separately.
*
* Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
*
* Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
* `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
*
* # Undefined behavior
*
* UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
* or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
* still has to be non-`NULL`.)
*/
Latin1Bidi encoding_mem_check_utf8_for_latin1_and_bidi(const char* buffer,
size_t len);
/**
* Converts bytes whose unsigned value is interpreted as Unicode code point
* (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
*
* The length of the destination buffer must be at least the length of the
* source buffer.
*
* The number of `char16_t`s written equals the length of the source buffer.
*
* # Panics
*
* Panics if the destination buffer is shorter than stated above.
*
* # Undefined behavior
*
* UB ensues if `src` and `src_len` don't designate a valid memory block, if
* `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
* block, if `dst` is `NULL` or if the two memory blocks overlap. (If
* `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
* aligned. Likewise for `dst` and `dst_len`.)
*/
void encoding_mem_convert_latin1_to_utf16(const char* src, size_t src_len,
char16_t* dst, size_t dst_len);
/**
* Converts bytes whose unsigned value is interpreted as Unicode code point
* (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
*
* The length of the destination buffer must be at least the length of the
* source buffer times two.
*
* Returns the number of bytes written.
*
* # Panics
*
* Panics if the destination buffer is shorter than stated above.
*
* # Safety
*
* Note that this function may write garbage beyond the number of bytes
* indicated by the return value.
*
* # Undefined behavior
*
* UB ensues if `src` and `src_len` don't designate a valid memory block, if
* `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
* block, if `dst` is `NULL` or if the two memory blocks overlap. (If
* `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
* aligned. Likewise for `dst` and `dst_len`.)
*/
size_t encoding_mem_convert_latin1_to_utf8(const char* src, size_t src_len,
char* dst, size_t dst_len);
/**
* Converts bytes whose unsigned value is interpreted as Unicode code point
* (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
* output space.
*
* Writes the number of code units read into `*src_len` and the number of
* bytes written into `*dst_len`.
*
* If the output isn't large enough, not all input is consumed.
*
* # Undefined behavior
*
* UB ensues if `src` and `src_len` don't designate a valid memory block, if
* `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
* block, if `dst` is `NULL` or if the two memory blocks overlap. (If
* `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
* aligned. Likewise for `dst` and `dst_len`.)
*/
void encoding_mem_convert_latin1_to_utf8_partial(const char* src,
size_t* src_len, char* dst,
size_t* dst_len);
/**
* Converts valid UTF-8 to valid UTF-16.
*
* The length of the destination buffer must be at least the length of the
* source buffer.
*
* Returns the number of `char16_t`s written.
*
* # Panics
*
* Panics if the destination buffer is shorter than stated above.
*
* # Undefined behavior
*
* UB ensues if `src` and `src_len` don't designate a valid memory block, if
* `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
* block, if `dst` is `NULL`, if the two memory blocks overlap, of if the
* buffer designated by `src` and `src_len` does not contain valid UTF-8. (If
* `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
* aligned. Likewise for `dst` and `dst_len`.)
*/
size_t encoding_mem_convert_str_to_utf16(const char* src, size_t src_len,
char16_t* dst, size_t dst_len);
/**
* If the input is valid UTF-16 representing only Unicode code points from
* U+0000 to U+00FF, inclusive, converts the input into output that
* represents the value of each code point as the unsigned byte value of
* each output byte.
*
* If the input does not fulfill the condition stated above, does something
* that is memory-safe without any promises about any properties of the
* output and will probably assert in debug builds in future versions.
* In particular, callers shouldn't assume the output to be the same across
* crate versions or CPU architectures and should not assume that non-ASCII
* input can't map to ASCII output.
*
* The length of the destination buffer must be at least the length of the
* source buffer.
*
* The number of bytes written equals the length of the source buffer.
*
* # Panics
*
* Panics if the destination buffer is shorter than stated above.
* (Probably in future versions if debug assertions are enabled (and not
* fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
*
* # Undefined behavior
*
* UB ensues if `src` and `src_len` don't designate a valid memory block, if
* `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
* block, if `dst` is `NULL` or if the two memory blocks overlap. (If
* `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
* aligned. Likewise for `dst` and `dst_len`.)
*/
void encoding_mem_convert_utf16_to_latin1_lossy(const char16_t* src,
size_t src_len, char* dst,
size_t dst_len);
/**
* Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
* with the REPLACEMENT CHARACTER.
*
* The length of the destination buffer must be at least the length of the
* source buffer times three.
*
* Returns the number of bytes written.
*
* # Panics
*
* Panics if the destination buffer is shorter than stated above.
*
* # Undefined behavior
*
* UB ensues if `src` and `src_len` don't designate a valid memory block, if
* `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
* block, if `dst` is `NULL` or if the two memory blocks overlap. (If
* `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
* aligned. Likewise for `dst` and `dst_len`.)
*/
size_t encoding_mem_convert_utf16_to_utf8(const char16_t* src, size_t src_len,
char* dst, size_t dst_len);
/**
* Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
* with the REPLACEMENT CHARACTER with potentially insufficient output
* space.
*
* Writes the number of code units read into `*src_len` and the number of
* bytes written into `*dst_len`.
*
* Guarantees that the bytes in the destination beyond the number of
* bytes claimed as written by the second item of the return tuple
* are left unmodified.
*
* Not all code units are read if there isn't enough output space.
* Note that this method isn't designed for general streamability but for
* not allocating memory for the worst case up front. Specifically,
* if the input starts with or ends with an unpaired surrogate, those are
* replaced with the REPLACEMENT CHARACTER.
*
* Matches the semantics of `TextEncoder.encodeInto()` from the
* Encoding Standard.
*
* # Undefined behavior
*
* UB ensues if `src` and `src_len` don't designate a valid memory block, if
* `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
* block, if `dst` is `NULL` or if the two memory blocks overlap. (If
* `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
* aligned. Likewise for `dst` and `dst_len`.)
*/
void encoding_mem_convert_utf16_to_utf8_partial(const char16_t* src,
size_t* src_len, char* dst,
size_t* dst_len);
/**
* If the input is valid UTF-8 representing only Unicode code points from
* U+0000 to U+00FF, inclusive, converts the input into output that
* represents the value of each code point as the unsigned byte value of
* each output byte.
*
* If the input does not fulfill the condition stated above, this function
* panics if debug assertions are enabled (and fuzzing isn't) and otherwise
* does something that is memory-safe without any promises about any
* properties of the output. In particular, callers shouldn't assume the
* output to be the same across crate versions or CPU architectures and
* should not assume that non-ASCII input can't map to ASCII output.
* The length of the destination buffer must be at least the length of the
* source buffer.
*
* Returns the number of bytes written.
*
* # Panics
*
* Panics if the destination buffer is shorter than stated above.
* If debug assertions are enabled (and not fuzzing) and the input is
* not in the range U+0000 to U+00FF, inclusive.
*
* # Undefined behavior
*
* UB ensues if `src` and `src_len` don't designate a valid memory block, if
* `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
* block, if `dst` is `NULL` or if the two memory blocks overlap. (If
* `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
* aligned. Likewise for `dst` and `dst_len`.)
*/
size_t encoding_mem_convert_utf8_to_latin1_lossy(const char* src,
size_t src_len, char* dst,
size_t dst_len);
/**
* Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
* with the REPLACEMENT CHARACTER.
*
* The length of the destination buffer must be at least the length of the
* source buffer _plus one_.
*
* Returns the number of `char16_t`s written.
*
* # Panics
*
* Panics if the destination buffer is shorter than stated above.
*
* # Undefined behavior
*
* UB ensues if `src` and `src_len` don't designate a valid memory block, if
* `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
* block, if `dst` is `NULL` or if the two memory blocks overlap. (If
* `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
* aligned. Likewise for `dst` and `dst_len`.)
*/
size_t encoding_mem_convert_utf8_to_utf16(const char* src, size_t src_len,
char16_t* dst, size_t dst_len);
/**
* Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
*
* The length of the destination buffer must be at least the length of the
* source buffer.
*
* Returns the number of `char16_t`s written or `SIZE_MAX` if the input was
* invalid.
*
* When the input was invalid, some output may have been written.
*
* # Panics
*
* Panics if the destination buffer is shorter than stated above.
*
* # Undefined behavior
*
* UB ensues if `src` and `src_len` don't designate a valid memory block, if
* `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
* block, if `dst` is `NULL` or if the two memory blocks overlap. (If
* `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
* aligned. Likewise for `dst` and `dst_len`.)
*/
size_t encoding_mem_convert_utf8_to_utf16_without_replacement(const char* src,
size_t src_len,
char16_t* dst,
size_t dst_len);
/**
* Copies ASCII from source to destination up to the first non-ASCII byte
* (or the end of the input if it is ASCII in its entirety).
*
* The length of the destination buffer must be at least the length of the
* source buffer.
*
* Returns the number of bytes written.
*
* # Panics
*
* Panics if the destination buffer is shorter than stated above.
*
* # Undefined behavior
*
* UB ensues if `src` and `src_len` don't designate a valid memory block, if
* `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
* block, if `dst` is `NULL` or if the two memory blocks overlap. (If
* `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
* aligned. Likewise for `dst` and `dst_len`.)
*/
size_t encoding_mem_copy_ascii_to_ascii(const char* src, size_t src_len,
char* dst, size_t dst_len);
/**
* Copies ASCII from source to destination zero-extending it to UTF-16 up to
* the first non-ASCII byte (or the end of the input if it is ASCII in its
* entirety).
*
* The length of the destination buffer must be at least the length of the
* source buffer.
*
* Returns the number of `char16_t`s written.
*
* # Panics
*
* Panics if the destination buffer is shorter than stated above.
*
* # Undefined behavior
*
* UB ensues if `src` and `src_len` don't designate a valid memory block, if
* `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
* block, if `dst` is `NULL` or if the two memory blocks overlap. (If
* `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
* aligned. Likewise for `dst` and `dst_len`.)
*/
size_t encoding_mem_copy_ascii_to_basic_latin(const char* src, size_t src_len,
char16_t* dst, size_t dst_len);
/**
* Copies Basic Latin from source to destination narrowing it to ASCII up to
* the first non-Basic Latin code unit (or the end of the input if it is
* Basic Latin in its entirety).
*
* The length of the destination buffer must be at least the length of the
* source buffer.
*
* Returns the number of bytes written.
*
* # Panics
*
* Panics if the destination buffer is shorter than stated above.
*
* # Undefined behavior
*
* UB ensues if `src` and `src_len` don't designate a valid memory block, if
* `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory
* block, if `dst` is `NULL` or if the two memory blocks overlap. (If
* `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and
* aligned. Likewise for `dst` and `dst_len`.)
*/
size_t encoding_mem_copy_basic_latin_to_ascii(const char16_t* src,
size_t src_len, char* dst,
size_t dst_len);
/**
* Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
*
* # Undefined behavior
*
* UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
* or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
* still has to be non-`NULL` and aligned.)
*/
void encoding_mem_ensure_utf16_validity(char16_t* buffer, size_t len);
/**
* Checks whether the buffer is all-ASCII.
*
* May read the entire buffer even if it isn't all-ASCII. (I.e. the function
* is not guaranteed to fail fast.)
*
* # Undefined behavior
*
* UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
* or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
* still has to be non-`NULL`.)
*/
bool encoding_mem_is_ascii(const char* buffer, size_t len);
/**
* Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
* only ASCII characters).
*
* May read the entire buffer even if it isn't all-ASCII. (I.e. the function
* is not guaranteed to fail fast.)
*
* # Undefined behavior
*
* UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
* or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
* still has to be non-`NULL` and aligned.)
*/
bool encoding_mem_is_basic_latin(const char16_t* buffer, size_t len);
/**
* Checks whether a scalar value triggers right-to-left processing.
*
* The check is done on a Unicode block basis without regard to assigned
* vs. unassigned code points in the block. Hebrew presentation forms in
* the Alphabetic Presentation Forms block are treated as if they formed
* a block on their own (i.e. it treated as right-to-left). Additionally,
* the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
* for. Control characters that are technically bidi controls but do not
* cause right-to-left behavior without the presence of right-to-left
* characters or right-to-left controls are not checked for. As a special
* case, U+FEFF is excluded from Arabic Presentation Forms-B.
*
* # Undefined behavior
*
* Undefined behavior ensues if `c` is not a valid Unicode Scalar Value.
*/
bool encoding_mem_is_char_bidi(char32_t c);
/**
* Checks whether a valid UTF-8 buffer contains code points that trigger
* right-to-left processing.
*
* The check is done on a Unicode block basis without regard to assigned
* vs. unassigned code points in the block. Hebrew presentation forms in
* the Alphabetic Presentation Forms block are treated as if they formed
* a block on their own (i.e. it treated as right-to-left). Additionally,
* the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
* for. Control characters that are technically bidi controls but do not
* cause right-to-left behavior without the presence of right-to-left
* characters or right-to-left controls are not checked for. As a special
* case, U+FEFF is excluded from Arabic Presentation Forms-B.
*
* # Undefined behavior
*
* UB ensues if `buffer` and `buffer_len` don't designate a valid memory block,
* if `buffer` is `NULL`, or if the memory designated by `buffer` and
* `buffer_len` does not contain valid UTF-8. (If `buffer_len` is `0`, `buffer`
* may be bogus but still has to be non-`NULL`.)
*/
bool encoding_mem_is_str_bidi(const char* buffer, size_t len);
/**
* Checks whether the buffer represents only code points less than or equal
* to U+00FF.
*
* Fails fast. (I.e. returns before having read the whole buffer if code
* points above U+00FF are discovered.
*
* # Undefined behavior
*
* UB ensues if `buffer` and `buffer_len` don't designate a valid memory block,
* if `buffer` is `NULL`, or if the memory designated by `buffer` and
* `buffer_len` does not contain valid UTF-8. (If `buffer_len` is `0`, `buffer`
* may be bogus but still has to be non-`NULL`.)
*/
bool encoding_mem_is_str_latin1(const char* buffer, size_t len);
/**
* Checks whether a UTF-16 buffer contains code points that trigger
* right-to-left processing.
*
* The check is done on a Unicode block basis without regard to assigned
* vs. unassigned code points in the block. Hebrew presentation forms in
* the Alphabetic Presentation Forms block are treated as if they formed
* a block on their own (i.e. it treated as right-to-left). Additionally,
* the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
* for. Control characters that are technically bidi controls but do not
* cause right-to-left behavior without the presence of right-to-left
* characters or right-to-left controls are not checked for. As a special
* case, U+FEFF is excluded from Arabic Presentation Forms-B.
* Returns `true` if the input contains an RTL character or an unpaired
* high surrogate that could be the high half of an RTL character.
* Returns `false` if the input contains neither RTL characters nor
* unpaired high surrogates that could be higher halves of RTL characters.
*
* # Undefined behavior
*
* UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
* or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
* still has to be non-`NULL` and aligned.)
*/
bool encoding_mem_is_utf16_bidi(const char16_t* buffer, size_t len);
/**
* Checks whether a UTF-16 code unit triggers right-to-left processing.
*
* The check is done on a Unicode block basis without regard to assigned
* vs. unassigned code points in the block. Hebrew presentation forms in
* the Alphabetic Presentation Forms block are treated as if they formed
* a block on their own (i.e. it treated as right-to-left). Additionally,
* the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
* for. Control characters that are technically bidi controls but do not
* cause right-to-left behavior without the presence of right-to-left
* characters or right-to-left controls are not checked for. As a special
* case, U+FEFF is excluded from Arabic Presentation Forms-B.
* Since supplementary-plane right-to-left blocks are identifiable from the
* high surrogate without examining the low surrogate, this function returns
* `true` for such high surrogates making the function suitable for handling
* supplementary-plane text without decoding surrogate pairs to scalar
* values. Obviously, such high surrogates are then reported as right-to-left
* even if actually unpaired.
*/
bool encoding_mem_is_utf16_code_unit_bidi(char16_t u);
/**
* Checks whether the buffer represents only code point less than or equal
* to U+00FF.
*
* May read the entire buffer even if it isn't all-Latin1. (I.e. the function
* is not guaranteed to fail fast.)
*
* # Undefined behavior
*
* UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
* or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
* still has to be non-`NULL` and aligned.)
*/
bool encoding_mem_is_utf16_latin1(const char16_t* buffer, size_t len);
/**
* Checks whether a potentially-invalid UTF-8 buffer contains code points
* that trigger right-to-left processing.
*
* The check is done on a Unicode block basis without regard to assigned
* vs. unassigned code points in the block. Hebrew presentation forms in
* the Alphabetic Presentation Forms block are treated as if they formed
* a block on their own (i.e. it treated as right-to-left). Additionally,
* the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
* for. Control characters that are technically bidi controls but do not
* cause right-to-left behavior without the presence of right-to-left
* characters or right-to-left controls are not checked for. As a special
* case, U+FEFF is excluded from Arabic Presentation Forms-B.
* Returns `true` if the input is invalid UTF-8 or the input contains an
* RTL character. Returns `false` if the input is valid UTF-8 and contains
* no RTL characters.
*
* # Undefined behavior
*
* UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
* or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
* still has to be non-`NULL`.)
*/
bool encoding_mem_is_utf8_bidi(const char* buffer, size_t len);
/**
* Checks whether the buffer is valid UTF-8 representing only code points
* less than or equal to U+00FF.
*
* Fails fast. (I.e. returns before having read the whole buffer if UTF-8
* invalidity or code points above U+00FF are discovered.
*
* # Undefined behavior
*
* UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
* or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
* still has to be non-`NULL`.)
*/
bool encoding_mem_is_utf8_latin1(const char* buffer, size_t len);
/**
* Returns the index of the first unpaired surrogate or, if the input is
* valid UTF-16 in its entirety, the length of the input.
*
* # Undefined behavior
*
* UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
* or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
* still has to be non-`NULL` and aligned.)
*/
size_t encoding_mem_utf16_valid_up_to(const char16_t* buffer, size_t len);
/**
* Returns the index of first byte that starts an invalid byte
* sequence or a non-Latin1 byte sequence, or the length of the
* string if there are neither.
*
* # Undefined behavior
*
* UB ensues if `buffer` and `buffer_len` don't designate a valid memory block
* or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but
* still has to be non-`NULL` and aligned.)
*/
size_t encoding_mem_utf8_latin1_up_to(const char* buffer, size_t len);
/**
* Returns the index of first byte that starts a non-Latin1 byte
* sequence, or the length of the string if there are none.
*
* # Undefined behavior
*
* UB ensues if `buffer` and `buffer_len` don't designate a valid memory block,
* if `buffer` is `NULL`, or if the memory block does not contain valid UTF-8.
* (If `buffer_len` is `0`, `buffer` may be bogus but still has to be non-`NULL`
* and aligned.)
*/
size_t encoding_mem_str_latin1_up_to(const char* buffer, size_t len);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // encoding_rs_mem_h_