The m17n Library 1.8.4
|
M-text objects and API for them. More...
Enumerations | |
enum | MTextFormat { MTEXT_FORMAT_US_ASCII , MTEXT_FORMAT_UTF_8 , MTEXT_FORMAT_UTF_16LE , MTEXT_FORMAT_UTF_16BE , MTEXT_FORMAT_UTF_32LE , MTEXT_FORMAT_UTF_32BE , MTEXT_FORMAT_MAX } |
Enumeration for specifying the format of an M-text. More... | |
enum | MTextLineBreakOption { MTEXT_LBO_SP_CM = 1 , MTEXT_LBO_KOREAN_SP = 2 , MTEXT_LBO_AI_AS_ID = 4 , MTEXT_LBO_MAX } |
Enumeration for specifying a set of line breaking options. More... | |
Functions | |
int | mtext_line_break (MText *mt, int pos, int option, int *after) |
Find a linebreak position of an M-text. | |
MText * | mtext () |
Allocate a new M-text. | |
MText * | mtext_from_data (const void *data, int nitems, enum MTextFormat format) |
Allocate a new M-text with specified data. | |
void * | mtext_data (MText *mt, enum MTextFormat *fmt, int *nunits, int *pos_idx, int *unit_idx) |
Get information about the text data in M-text. | |
int | mtext_len (MText *mt) |
Number of characters in M-text. | |
int | mtext_ref_char (MText *mt, int pos) |
Return the character at the specified position in an M-text. | |
int | mtext_set_char (MText *mt, int pos, int c) |
Store a character into an M-text. | |
MText * | mtext_cat_char (MText *mt, int c) |
Append a character to an M-text. | |
MText * | mtext_dup (MText *mt) |
Create a copy of an M-text. | |
MText * | mtext_cat (MText *mt1, MText *mt2) |
Append an M-text to another. | |
MText * | mtext_ncat (MText *mt1, MText *mt2, int n) |
Append a part of an M-text to another. | |
MText * | mtext_cpy (MText *mt1, MText *mt2) |
Copy an M-text to another. | |
MText * | mtext_ncpy (MText *mt1, MText *mt2, int n) |
Copy the first few characters of an M-text to another. | |
MText * | mtext_duplicate (MText *mt, int from, int to) |
Create a new M-text from a part of an existing M-text. | |
MText * | mtext_copy (MText *mt1, int pos, MText *mt2, int from, int to) |
Copy characters in the specified range into an M-text. | |
int | mtext_del (MText *mt, int from, int to) |
Delete characters in the specified range destructively. | |
int | mtext_ins (MText *mt1, int pos, MText *mt2) |
Insert an M-text into another M-text. | |
int | mtext_insert (MText *mt1, int pos, MText *mt2, int from, int to) |
Insert sub-text of an M-text into another M-text. | |
int | mtext_ins_char (MText *mt, int pos, int c, int n) |
Insert a character into an M-text. | |
int | mtext_replace (MText *mt1, int from1, int to1, MText *mt2, int from2, int to2) |
Replace sub-text of M-text with another. | |
int | mtext_character (MText *mt, int from, int to, int c) |
Search a character in an M-text. | |
int | mtext_chr (MText *mt, int c) |
Return the position of the first occurrence of a character in an M-text. | |
int | mtext_rchr (MText *mt, int c) |
Return the position of the last occurrence of a character in an M-text. | |
int | mtext_cmp (MText *mt1, MText *mt2) |
Compare two M-texts character-by-character. | |
int | mtext_ncmp (MText *mt1, MText *mt2, int n) |
Compare initial parts of two M-texts character-by-character. | |
int | mtext_compare (MText *mt1, int from1, int to1, MText *mt2, int from2, int to2) |
Compare specified regions of two M-texts. | |
int | mtext_spn (MText *mt, MText *accept) |
Search an M-text for a set of characters. | |
int | mtext_cspn (MText *mt, MText *reject) |
Search an M-text for the complement of a set of characters. | |
int | mtext_pbrk (MText *mt, MText *accept) |
Search an M-text for any of a set of characters. | |
MText * | mtext_tok (MText *mt, MText *delim, int *pos) |
Look for a token in an M-text. | |
int | mtext_text (MText *mt1, int pos, MText *mt2) |
Locate an M-text in another. | |
int | mtext_search (MText *mt1, int from, int to, MText *mt2) |
Locate an M-text in a specific range of another. | |
int | mtext_casecmp (MText *mt1, MText *mt2) |
Compare two M-texts ignoring cases. | |
int | mtext_ncasecmp (MText *mt1, MText *mt2, int n) |
Compare initial parts of two M-texts ignoring cases. | |
int | mtext_case_compare (MText *mt1, int from1, int to1, MText *mt2, int from2, int to2) |
Compare specified regions of two M-texts ignoring cases. | |
int | mtext_lowercase (MText *mt) |
Lowercase an M-text. | |
int | mtext_titlecase (MText *mt) |
Titlecase an M-text. | |
int | mtext_uppercase (MText *mt) |
Uppercase an M-text. | |
Variables | |
MSymbol | Mlanguage |
Variables: Default Endian of UTF-16 and UTF-32 | |
enum MTextFormat | MTEXT_FORMAT_UTF_16 |
Variable of value MTEXT_FORMAT_UTF_16LE or MTEXT_FORMAT_UTF_16BE. | |
const int | MTEXT_FORMAT_UTF_32 |
Variable of value MTEXT_FORMAT_UTF_32LE or MTEXT_FORMAT_UTF_32BE. | |
M-text objects and API for them.
In the m17n library, text is represented as an object called M-text rather than as a C-string (char * or unsigned char *). An M-text is a sequence of characters whose length is equal to or greater than 0, and can be created from various character sources, e.g. C-strings, files, character codes, etc.
M-texts are more useful than C-strings in the following points.
In addition, the library provides many functions to manipulate an M-text just the same way as a C-string.
enum MTextFormat |
Enumeration for specifying the format of an M-text.
The enum MTextFormat is used as an argument of the mtext_from_data() function to specify the format of data from which an M-text is created.
enum MTextLineBreakOption |
Enumeration for specifying a set of line breaking options.
The enum MTextLineBreakOption is to control the line breaking algorithm of the function mtext_line_break() by specifying logical-or of the members in the arg option.
int mtext_line_break | ( | MText * | mt, |
int | pos, | ||
int | option, | ||
int * | after | ||
) |
Find a linebreak position of an M-text.
The mtext_line_break() function checks if position pos is a proper linebreak position of an M-text mt according to the algorithm of The Unicode Standard 4.0 UAX#14. If so, it returns pos. Otherwise, it returns a proper linebreak position before pos.
If option is nonzero, it controls the algorithm by logical-or of the members of MTextLineBreakOption.
If after is not NULL, a proper linebreak position after pos is stored there.
MText * mtext | ( | ) |
Allocate a new M-text.
The mtext() function allocates a new M-text of length 0 and returns a pointer to it. The allocated M-text will not be freed unless the user explicitly does so with the m17n_object_unref() function.
MText * mtext_from_data | ( | const void * | data, |
int | nitems, | ||
enum MTextFormat | format | ||
) |
Allocate a new M-text with specified data.
The mtext_from_data() function allocates a new M-text whose character sequence is specified by array data of nitems elements. format specifies the format of data.
When format is either MTEXT_FORMAT_US_ASCII or MTEXT_FORMAT_UTF_8, the contents of data must be of the type unsigned char, and nitems counts by byte.
When format is either MTEXT_FORMAT_UTF_16LE or MTEXT_FORMAT_UTF_16BE, the contents of data must be of the type unsigned short, and nitems counts by unsigned short.
When format is either MTEXT_FORMAT_UTF_32LE or MTEXT_FORMAT_UTF_32BE, the contents of data must be of the type unsigned, and nitems counts by unsigned.
The character sequence of the M-text is not modifiable.
The contents of data must not be modified while the M-text is alive.
The allocated M-text will not be freed unless the user explicitly does so with the m17n_object_unref() function. Even in that case, data is not freed.
If an error is detected, it returns NULL and assigns an error code to the external variable merror_code. Errors: MERROR_MTEXT
void * mtext_data | ( | MText * | mt, |
enum MTextFormat * | fmt, | ||
int * | nunits, | ||
int * | pos_idx, | ||
int * | unit_idx | ||
) |
Get information about the text data in M-text.
The mtext_data() function returns a pointer to the text data of M-text mt. If fmt is not NULL, the format of the text data is stored in it. If nunits is not NULL, the number of units of the text data is stored in it.
If pos_idx is not NULL and it points to a non-negative number, what it points to is a character position. In this case, the return value is a pointer to the text data of a character at that position.
Otherwise, if unit_idx is not NULL, it points to a unit position. In this case, the return value is a pointer to the text data of a character containing that unit.
The character position and unit position of the return value are stored in pos_idx and unit_idx respectively if they are not NULL.
If the format of the text data is MTEXT_FORMAT_US_ASCII or MTEXT_FORMAT_UTF_8, one unit is unsigned char.
If the format is MTEXT_FORMAT_UTF_16LE or MTEXT_FORMAT_UTF_16BE, one unit is unsigned short.
If the format is MTEXT_FORMAT_UTF_32LE or MTEXT_FORMAT_UTF_32BE, one unit is unsigned int.
int mtext_len | ( | MText * | mt | ) |
Number of characters in M-text.
The mtext_len() function returns the number of characters in M-text mt.
int mtext_ref_char | ( | MText * | mt, |
int | pos | ||
) |
Return the character at the specified position in an M-text.
The mtext_ref_char() function returns the character at pos in M-text mt. If an error is detected, it returns -1 and assigns an error code to the external variable merror_code.
MERROR_RANGE
int mtext_set_char | ( | MText * | mt, |
int | pos, | ||
int | c | ||
) |
Store a character into an M-text.
The mtext_set_char() function sets character c, which has no text properties, at pos in M-text mt.
MERROR_RANGE
Append a character to an M-text.
The mtext_cat_char() function appends character c, which has no text properties, to the end of M-text mt.
On failure, it returns NULL.
Create a copy of an M-text.
The mtext_dup() function creates a copy of M-text mt while inheriting all the text properties of mt.
Append an M-text to another.
The mtext_cat() function appends M-text mt2 to the end of M-text mt1 while inheriting all the text properties. mt2 itself is not modified.
Append a part of an M-text to another.
The mtext_ncat() function appends the first n characters of M-text mt2 to the end of M-text mt1 while inheriting all the text properties. If the length of mt2 is less than n, all characters are copied. mt2 is not modified.
If an error is detected, it returns NULL and assigns an error code to the global variable merror_code. Errors: MERROR_RANGE
Copy an M-text to another.
The mtext_cpy() function copies M-text mt2 to M-text mt1 while inheriting all the text properties. The old text in mt1 is overwritten and the length of mt1 is extended if necessary. mt2 is not modified.
Copy the first few characters of an M-text to another.
The mtext_ncpy() function copies the first n characters of M-text mt2 to M-text mt1 while inheriting all the text properties. If the length of mt2 is less than n, all characters of mt2 are copied. The old text in mt1 is overwritten and the length of mt1 is extended if necessary. mt2 is not modified.
If an error is detected, it returns NULL and assigns an error code to the global variable merror_code. Errors: MERROR_RANGE
Create a new M-text from a part of an existing M-text.
The mtext_duplicate() function creates a copy of sub-text of M-text mt, starting at from (inclusive) and ending at to (exclusive) while inheriting all the text properties of mt. mt itself is not modified.
MERROR_RANGE
Copy characters in the specified range into an M-text.
The mtext_copy() function copies the text between from (inclusive) and to (exclusive) in M-text mt2 to the region starting at pos in M-text mt1 while inheriting the text properties. The old text in mt1 is overwritten and the length of mt1 is extended if necessary. mt2 is not modified.
If an error is detected, it returns NULL and assigns an error code to the external variable merror_code. Errors: MERROR_RANGE
int mtext_del | ( | MText * | mt, |
int | from, | ||
int | to | ||
) |
Delete characters in the specified range destructively.
The mtext_del() function deletes the characters in the range from (inclusive) and to (exclusive) from M-text mt destructively. As a result, the length of mt shrinks by (to - from) characters.
MERROR_RANGE
Insert an M-text into another M-text.
The mtext_ins() function inserts M-text mt2 into M-text mt1, at position pos. As a result, mt1 is lengthened by the length of mt2. On insertion, all the text properties of mt2 are inherited. The original mt2 is not modified.
MERROR_RANGE, MERROR_MTEXT
Insert sub-text of an M-text into another M-text.
The mtext_insert() function inserts sub-text of M-text mt2 between from (inclusive) and to (exclusive) into M-text mt1, at position pos. As a result, mt1 is lengthened by (to - from). On insertion, all the text properties of the sub-text of mt2 are inherited.
MERROR_MTEXT, MERROR_RANGE
int mtext_ins_char | ( | MText * | mt, |
int | pos, | ||
int | c, | ||
int | n | ||
) |
Insert a character into an M-text.
The mtext_ins_char() function inserts n copies of character c into M-text mt at position pos. As a result, mt is lengthened by n.
MERROR_RANGE
Replace sub-text of M-text with another.
The mtext_replace() function replaces sub-text of M-text mt1 between from1 (inclusive) and to1 (exclusive) with the sub-text of M-text mt2 between from2 (inclusive) and to2 (exclusive). The new sub-text inherits text properties of the old sub-text.
MERROR_MTEXT, MERROR_RANGE
int mtext_character | ( | MText * | mt, |
int | from, | ||
int | to, | ||
int | c | ||
) |
Search a character in an M-text.
The mtext_character() function searches M-text mt for character c. If from is less than to, the search begins at position from and goes forward but does not exceed (to - 1). Otherwise, the search begins at position (from - 1) and goes backward but does not exceed to. An invalid position specification is regarded as both from and to being 0.
int mtext_chr | ( | MText * | mt, |
int | c | ||
) |
Return the position of the first occurrence of a character in an M-text.
The mtext_chr() function searches M-text mt for character c. The search starts from the beginning of mt and goes toward the end.
MERROR_RANGE
int mtext_rchr | ( | MText * | mt, |
int | c | ||
) |
Return the position of the last occurrence of a character in an M-text.
The mtext_rchr() function searches M-text mt for character c. The search starts from the end of mt and goes backward toward the beginning.
MERROR_RANGE
Compare two M-texts character-by-character.
The mtext_cmp() function compares M-texts mt1 and mt2 character by character.
Compare initial parts of two M-texts character-by-character.
The mtext_ncmp() function is similar to mtext_cmp(), but compares at most n characters from the beginning.
Compare specified regions of two M-texts.
The mtext_compare() function compares two M-texts mt1 and mt2, character-by-character. The compared regions are between from1 and to1 in mt1 and between from2 and to2 in mt2. from1 and from2 are inclusive, to1 and to2 are exclusive. from1 being equal to to1 (or from2 being equal to to2) means an M-text of length zero. An invalid region specification is regarded as both from1 and to1 (or from2 and to2) being 0.
Search an M-text for a set of characters.
The mtext_spn() function returns the length of the initial segment of M-text mt that consists entirely of characters in M-text accept.
Search an M-text for the complement of a set of characters.
The mtext_cspn() function returns the length of the initial segment of M-text mt that consists entirely of characters not in M-text reject.
Search an M-text for any of a set of characters.
The mtext_pbrk() function locates the first occurrence in M-text mt of any of the characters in M-text accept.
Look for a token in an M-text.
The mtext_tok() function searches for the first token that occurs after position pos in M-text mt. Here, a token means a substring none of whose characters appears in M-text delim. Note that the type of pos is not int but pointer to int.
If no token is found, it returns NULL without changing the external variable merror_code. If an error is detected, it returns NULL and assigns an error code to the external variable merror_code. Errors: MERROR_RANGE
Locate an M-text in another.
The mtext_text() function finds the first occurrence of M-text mt2 in M-text mt1 after the position pos while ignoring difference of the text properties.
Locate an M-text in a specific range of another.
The mtext_search() function searches for the first occurrence of M-text mt2 in M-text mt1 in the region between from and to while ignoring differences in the text properties. If from is less than to, the forward search starts from from, otherwise the backward search starts from to.
Compare two M-texts ignoring cases.
The mtext_casecmp() function is similar to mtext_cmp(), but ignores cases on comparison.
Compare initial parts of two M-texts ignoring cases.
The mtext_ncasecmp() function is similar to mtext_casecmp(), but compares at most n characters from the beginning.
Compare specified regions of two M-texts ignoring cases.
The mtext_case_compare() function compares two M-texts mt1 and mt2, character-by-character, ignoring cases. The compared regions are between from1 and to1 in mt1 and between from2 and to2 in mt2. from1 and from2 are inclusive, to1 and to2 are exclusive. from1 being equal to to1 (or from2 being equal to to2) means an M-text of length zero. An invalid region specification is regarded as both from1 and to1 (or from2 and to2) being 0.
int mtext_lowercase | ( | MText * | mt | ) |
Lowercase an M-text.
The mtext_lowercase() function destructively converts each character in M-text mt to lowercase. Adjacent characters in mt may affect the case conversion. If the Mlanguage text property is attached to mt, it may also affect the conversion. The length of mt may change. Characters that cannot be converted to lowercase are left unchanged. All the text properties are inherited.
int mtext_titlecase | ( | MText * | mt | ) |
Titlecase an M-text.
The mtext_titlecase() function destructively converts the first character with the cased property in M-text mt to titlecase and the others to lowercase. The length of mt may change. If the character cannot be converted to titlecase, it is left unchanged. All the text properties are inherited.
int mtext_uppercase | ( | MText * | mt | ) |
Uppercase an M-text.
The mtext_uppercase() function destructively converts each character in M-text mt to uppercase. Adjacent characters in mt may affect the case conversion. If the Mlanguage text property is attached to mt, it may also affect the conversion. The length of mt may change. Characters that cannot be converted to uppercase are left unchanged. All the text properties are inherited.
extern enum MTextFormat MTEXT_FORMAT_UTF_16 |
Variable of value MTEXT_FORMAT_UTF_16LE or MTEXT_FORMAT_UTF_16BE.
The global variable MTEXT_FORMAT_UTF_16 is initialized to MTEXT_FORMAT_UTF_16LE on a "Little Endian" system (storing words with the least significant byte first), and to MTEXT_FORMAT_UTF_16BE on a "Big Endian" system (storing words with the most significant byte first).
extern const int MTEXT_FORMAT_UTF_32 |
Variable of value MTEXT_FORMAT_UTF_32LE or MTEXT_FORMAT_UTF_32BE.
The global variable MTEXT_FORMAT_UTF_32 is initialized to MTEXT_FORMAT_UTF_32LE on a "Little Endian" system (storing words with the least significant byte first), and to MTEXT_FORMAT_UTF_32BE on a "Big Endian" system (storing words with the most significant byte first).
MSymbol Mlanguage |
The symbol whose name is "language".