[string] Realistic API proposal

Hello All,

I'd like to provide a realistic string API proposal:

1. It is a Unicode-aware extension of std::string.
2. It is fully compatible with std::string, giving it the ability to be included in C++XYZ.
3. It can be implemented today!

Basically it extends std::string with:

a) const_code_point_iterator - for iterating over the string - bidirectional
b) code_point_iterator - a back inserter
c) Unicode and locale functionality:
   - normalization
   - case handling
   - comparison
   - search
d) marking the non-const operator[] and at() as deprecated

Advantages:
-----------

1. It can be implemented easily (when Boost.Locale is given)
2. It **is** a compatible replacement for std::string
3. It allows using std::string under the hood as storage, giving high efficiency when assigning boost::string to std::string when the implementation is COW (almost all implementations with the exception of MSVC)
4. It is fully Unicode aware
5. It pushes the "UTF-8" idea into standard C++
6. You don't pay for what you do not need.

Proposed API:
-------------

namespace boost {

    // Fully bidirectional iterator
    template<typename UnitsIterator>
    class const_code_point_iterator {
    public:
        const_code_point_iterator(UnitsIterator begin,UnitsIterator end); // begin
        const_code_point_iterator(UnitsIterator begin,UnitsIterator end,UnitsIterator location); // current pos
        const_code_point_iterator(); // end

        #ifdef C++0x
        typedef char32_t const_code_point_type;
        #else
        typedef unsigned const_code_point_type;
        #endif

        const_code_point_type operator*() const;
        ...
    };

    /// Output iterator
    template<typename BackInserter>
    class code_point_iterator {
    public:
        code_point_iterator(BackInserter out); // begin
        code_point_iterator(); // end

        #ifdef C++0x
        typedef char32_t code_point_type;
        #else
        typedef unsigned code_point_type;
        #endif

        code_point_type operator*() const;
        ...
    };

    template<typename Char,typename Traits=std::char_traits<Char>,typename Alloc=std::allocator<Char> >
    class basic_string {
    public:
        // { boost specific
        typedef std::basic_string<Char,Traits,Alloc> std_string_type;
        // } boost specific

        // All the standard std::string member functions

        // Deprecated interfaces that exist for backward compatibility,
        // as they are not Unicode aware
        value_type &at(size_type indx);
        value_type &operator[](size_type indx);
        iterator begin();
        iterator end();

        // { boost specific compatibility functions with std::string; they would go away
        // as std::string becomes extended with boost::string's new interfaces
        //
        basic_string(std_string_type const &other) : data_(other) {}
        basic_string(std_string_type const &other,size_type index,size_type len) : data_(other,index,len) {}

        ...

        operator std_string_type() const { return data_; }

        // } boost specific compatibility functions

        //
        // Unicode Support
        // ------------------------
        //

        //
        // UTF code point iteration
        //

        #ifdef C++0x
        typedef char32_t code_point_type;
        #else
        typedef unsigned code_point_type;
        #endif

        typedef boost::const_code_point_iterator<const_iterator> const_code_point_iterator;

        const_code_point_iterator code_point_begin() const
        {
            return const_code_point_iterator(begin(),end());
        }
        const_code_point_iterator code_point_end() const
        {
            return const_code_point_iterator(begin(),end(),end());
        }

        typedef boost::code_point_iterator<std::back_insert_iterator<basic_string> > code_point_iterator;

        code_point_iterator back_inserter()
        {
            return code_point_iterator(std::back_inserter(*this));
        }

        basic_string &operator+=(code_point_type code_point);
        basic_string operator+(code_point_type code_point) const;
        void append(code_point_type code_point);

        //
        // Lexical operations on the string
        //

        // Case handling
        basic_string upper_case(std::locale const &l=std::locale()) const;
        basic_string lower_case(std::locale const &l=std::locale()) const;
        basic_string title_case(std::locale const &l=std::locale()) const;
        basic_string fold_case() const; // locale independent

        // Unicode normalization
        typedef enum { nfc, nfkc, nfd, nfkd } normalization_mode;

        basic_string normalize(normalization_mode mode = nfc) const;

        // normalized string constructors
        basic_string(basic_string const &,normalization_mode mode);
        basic_string(Char const *,normalization_mode mode);
        basic_string(Char const *,size_t n,normalization_mode mode);
        template<typename Iterator>
        basic_string(Iterator begin,Iterator end,normalization_mode mode);

        void append_normalized(basic_string const &other,normalization_mode mode = nfc);
        void append_normalized(Char const *,normalization_mode mode = nfc);
        void append_normalized(Char const *,size_t n,normalization_mode mode = nfc);

        basic_string concat_normalized(basic_string const &other,normalization_mode mode = nfc) const;
        basic_string concat_normalized(Char const *,normalization_mode mode = nfc) const;
        basic_string concat_normalized(Char const *,size_t n,normalization_mode mode = nfc) const;

        // Unicode validation
        bool valid_utf() const;

        struct validate_utf {}; // Unicode validation tag

        // Create a string, validating it
        basic_string(basic_string const &,validate_utf const &);
        basic_string(Char const *,validate_utf const &);
        basic_string(Char const *,size_t n,validate_utf const &);
        template<typename Iterator>
        basic_string(Iterator begin,Iterator end,validate_utf const &);

        // Create a string, validating and normalizing it
        basic_string(basic_string const &,validate_utf const &,normalization_mode mode);
        basic_string(Char const *,validate_utf const &,normalization_mode mode);
        basic_string(Char const *,size_t n,validate_utf const &,normalization_mode mode);
        template<typename Iterator>
        basic_string(Iterator begin,Iterator end,validate_utf const &,normalization_mode mode);

        // Search and comparison
        typedef enum {
            primary    = 0, ///< 1st collation level: base letters
            secondary  = 1, ///< 2nd collation level: letters and accents
            tertiary   = 2, ///< 3rd collation level: letters, accents and case
            quaternary = 3, ///< 4th collation level: letters, accents, case and punctuation
            identical  = 4  ///< identical collation level: includes code-point comparison
        } level_type;

        //
        // search(...) returns a pair of (index, size), since the matched text
        // may have a different size from the string you search for
        //
        std::pair<size_type,size_type> search(basic_string const &other) const;
        std::pair<size_type,size_type> search(basic_string const &other,level_type level) const;
        std::pair<size_type,size_type> search(basic_string const &other,std::locale const &l) const;
        std::pair<size_type,size_type> search(basic_string const &other,level_type level,std::locale const &l) const;
        std::pair<size_type,size_type> search(basic_string const &other,size_t index,size_t size) const;
        std::pair<size_type,size_type> search(basic_string const &other,size_t index,size_t size,level_type level) const;
        std::pair<size_type,size_type> search(basic_string const &other,size_t index,size_t size,std::locale const &l) const;
        std::pair<size_type,size_type> search(basic_string const &other,size_t index,size_t size,level_type level,std::locale const &l) const;

        std::pair<size_type,size_type> search(Char const *) const;
        std::pair<size_type,size_type> search(Char const *,level_type level) const;
        std::pair<size_type,size_type> search(Char const *,std::locale const &l) const;
        std::pair<size_type,size_type> search(Char const *,level_type level,std::locale const &l) const;
        std::pair<size_type,size_type> search(Char const *,size_t size) const;
        std::pair<size_type,size_type> search(Char const *,size_t size,level_type level) const;
        std::pair<size_type,size_type> search(Char const *,size_t size,std::locale const &l) const;
        std::pair<size_type,size_type> search(Char const *,size_t size,level_type level,std::locale const &l) const;

        int compare_to(basic_string const &other) const;
        int compare_to(basic_string const &other,level_type level) const;
        int compare_to(basic_string const &other,std::locale const &l) const;
        int compare_to(basic_string const &other,level_type level,std::locale const &l) const;
        int compare_to(basic_string const &other,size_t index,size_t size) const;
        int compare_to(basic_string const &other,size_t index,size_t size,level_type level) const;
        int compare_to(basic_string const &other,size_t index,size_t size,std::locale const &l) const;
        int compare_to(basic_string const &other,size_t index,size_t size,level_type level,std::locale const &l) const;

        int compare_to(size_t index,size_t size,basic_string const &other) const;
        int compare_to(size_t index,size_t size,basic_string const &other,level_type level) const;
        int compare_to(size_t index,size_t size,basic_string const &other,std::locale const &l) const;
        int compare_to(size_t index,size_t size,basic_string const &other,level_type level,std::locale const &l) const;
        int compare_to(size_t index,size_t size,basic_string const &other,size_t other_index,size_t other_size) const;
        int compare_to(size_t index,size_t size,basic_string const &other,size_t other_index,size_t other_size,level_type level) const;
        int compare_to(size_t index,size_t size,basic_string const &other,size_t other_index,size_t other_size,std::locale const &l) const;
        int compare_to(size_t index,size_t size,basic_string const &other,size_t other_index,size_t other_size,level_type level,std::locale const &l) const;

        int compare_to(Char const *) const;
        int compare_to(Char const *,level_type level) const;
        int compare_to(Char const *,std::locale const &l) const;
        int compare_to(Char const *,level_type level,std::locale const &l) const;
        int compare_to(Char const *,size_t size) const;
        int compare_to(Char const *,size_t size,level_type level) const;
        int compare_to(Char const *,size_t size,std::locale const &l) const;
        int compare_to(Char const *,size_t size,level_type level,std::locale const &l) const;

        int compare_to(size_t size,Char const *) const;
        int compare_to(size_t size,Char const *,level_type level) const;
        int compare_to(size_t size,Char const *,std::locale const &l) const;
        int compare_to(size_t size,Char const *,level_type level,std::locale const &l) const;
        int compare_to(size_t size,Char const *,size_t other_size) const;
        int compare_to(size_t size,Char const *,size_t other_size,level_type level) const;
        int compare_to(size_t size,Char const *,size_t other_size,std::locale const &l) const;
        int compare_to(size_t size,Char const *,size_t other_size,level_type level,std::locale const &l) const;

        // UTF validation
        bool is_valid_utf() const;

    private:
        std_string_type data_;
    };

} // boost
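To make the intent concrete, here is a rough sketch of how client code might look if this interface existed. None of it compiles today: the header name is invented, and the calls are taken directly from the declarations above, so treat it only as an illustration of the proposal, not as working code.

    #include <iostream>
    #include <string>
    #include <utility>
    // #include <boost/string.hpp>   // hypothetical header for the proposed class

    int main()
    {
        std::string raw = "Gr\xC3\xBC\xC3\x9F Gott";          // "Grüß Gott" encoded as UTF-8
        boost::basic_string<char> s(raw);                      // compatibility constructor from std::string

        if(!s.valid_utf())                                     // reject broken input from an untrusted source
            return 1;

        boost::basic_string<char> upper = s.upper_case();      // locale-sensitive case mapping
        boost::basic_string<char> nfc   = s.normalize();       // NFC by default

        // code point iteration instead of code unit iteration
        for(boost::basic_string<char>::const_code_point_iterator it = s.code_point_begin();
            it != s.code_point_end(); ++it)
        {
            std::cout << std::hex << static_cast<unsigned long>(*it) << ' ';
        }

        // collation-aware search at the primary level: finds "Gott" when searching for "gott"
        std::pair<boost::basic_string<char>::size_type,
                  boost::basic_string<char>::size_type> hit =
            s.search("gott", boost::basic_string<char>::primary);

        std::string back = nfc;                                // implicit conversion back to std::string
        (void)hit; (void)back; (void)upper;
        return 0;
    }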

On 28.01.2011 11:41, Artyom wrote:
2. It is fully compatible with std::string, giving it the ability to be included in C++XYZ.

Actually, I think changing std::string from being "whatever the user put into it" to "UTF-8, definitely" is not a compatible change. Worse, it's a runtime-breaking change, not a compile-time-breaking change.

Inspecting your proposed interface, I think you mean for std::string to be any data, with the additional functionality only working if is_valid_utf() is true. That sounds rather dangerous in usage, though, since such a requirement is hard to remember. ("Now which member functions can I call on all strings, and which only on valid UTF?")
Advantages: -----------
3. It allows using std::string under the hood as storage, giving high efficiency when assigning boost::string to std::string when the implementation is COW (almost all implementations with the exception of MSVC)

I'm sorry, but COW is on its way out. Conforming C++0x strings are forbidden to use it.
Sebastian

On 28/01/2011 11:41, Artyom wrote:
b) code_point_iterator - back inserter
You could simply define a push_back(char32_t) and have it naturally be called by std::back_inserter.
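A quick illustration of that point. The wrapper type u8string below is made up for the example; the point is only that once a container exposes push_back(char32_t) and a matching value_type, std::back_inserter works unchanged and no dedicated output-iterator class is needed.

    #include <algorithm>
    #include <iterator>
    #include <string>
    #include <vector>

    struct u8string {
        typedef char32_t value_type;     // what back_insert_iterator will pass to push_back
        std::string bytes;

        // append one code point as UTF-8 (assumes cp is a valid scalar value)
        void push_back(char32_t cp) {
            if (cp < 0x80) {
                bytes += static_cast<char>(cp);
            } else if (cp < 0x800) {
                bytes += static_cast<char>(0xC0 | (cp >> 6));
                bytes += static_cast<char>(0x80 | (cp & 0x3F));
            } else if (cp < 0x10000) {
                bytes += static_cast<char>(0xE0 | (cp >> 12));
                bytes += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
                bytes += static_cast<char>(0x80 | (cp & 0x3F));
            } else {
                bytes += static_cast<char>(0xF0 | (cp >> 18));
                bytes += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
                bytes += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
                bytes += static_cast<char>(0x80 | (cp & 0x3F));
            }
        }
    };

    int main() {
        std::vector<char32_t> cps = { 0x48, 0x65, 0x5D0, 0x1F600 };   // H, e, א, 😀
        u8string s;
        // back_inserter simply calls push_back; no custom code_point_iterator needed
        std::copy(cps.begin(), cps.end(), std::back_inserter(s));
    }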
3. It allows using std::string under the hood as storage, giving high efficiency when assigning boost::string to std::string when the implementation is COW (almost all implementations with the exception of MSVC)
COW implementations of std::string are not allowed anymore starting with C++0x.
4. It is fully Unicode aware
5. It pushes the "UTF-8" idea into standard C++
6. You don't pay for what you do not need.
What am I paying for? I don't see how I gain anything.
Proposed API: -------------
namespace boost {
// Fully bidirectional iterator
template<typename UnitsIterator>
class const_code_point_iterator {
public:
    const_code_point_iterator(UnitsIterator begin,UnitsIterator end); // begin
    const_code_point_iterator(UnitsIterator begin,UnitsIterator end,UnitsIterator location); // current pos
    const_code_point_iterator(); // end

    #ifdef C++0x
    typedef char32_t const_code_point_type;
    #else
    typedef unsigned const_code_point_type;
    #endif
Just define boost::char32 once (depending on BOOST_NO_CHAR32_T) and use that instead of putting ifdefs everywhere. (that's what boost/cuchar.hpp does in my library)
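For reference, a sketch of what that single definition might look like. The real boost/cuchar.hpp may well differ in detail; BOOST_NO_CHAR32_T is the Boost.Config macro that reports whether the native type is available.

    #include <boost/config.hpp>
    #include <boost/cstdint.hpp>

    namespace boost {
    #ifdef BOOST_NO_CHAR32_T
        typedef boost::uint_least32_t char32;   // fallback on compilers without char32_t
    #else
        typedef char32_t char32;                // native C++0x type
    #endif
    }

    // user code then just writes boost::char32 everywhere, with no per-use #ifdef blocks:
    // typedef boost::char32 code_point_type;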
// UTF validation
bool is_valid_utf() const;
See, that's what makes the whole thing pointless. Your type doesn't add any semantic value on top of std::string, it's just an agglomeration of free functions into a class. That's a terrible design.

The only advantage that a specific type for Unicode strings would bring is that it could enforce certain useful invariants. But your proposal doesn't even enforce that the string is valid UTF-8. Enforcing that the string is in a valid UTF encoding and is normalized in a specific normalization form can make most Unicode algorithms several orders of magnitude faster.

Since people seem to want this, here is a simple proposal:

    template<typename T>
    struct ustring;

where T must be a Forward Sequence of char, char16, char32 or wchar_t. The type then acts as an adaptor over that sequence but enforces that the data is encoded in UTF-X in normalization form C, with X deduced from the value type of the inner Forward Sequence.

ustring would be an immutable range of code units, with whatever refinements (bidirectional or random access) the inner Forward Sequence allows. I thought it was accepted that strings should be immutable. Otherwise, insertions at the front/back could be added if the underlying forward sequence allows them. Its operator+ would return a lazy join expression.

And that's all there is to it. Use free functions for the rest; ustring could provide some member helpers if that really makes life easier for some people. All of this is trivial to implement quickly with my Unicode library.
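An interface skeleton of that idea, to make the invariant explicit. This is only an illustration, not the library's actual code; is_valid_utf and is_nfc stand in for whatever validation and normalization-check routines the backend provides.

    #include <stdexcept>
    #include <utility>

    template<typename Sequence> bool is_valid_utf(Sequence const &); // hypothetical helper
    template<typename Sequence> bool is_nfc(Sequence const &);       // hypothetical helper

    template<typename Sequence>          // Forward Sequence of char, char16, char32 or wchar_t
    class ustring {
    public:
        typedef typename Sequence::value_type     value_type; // code unit type; UTF-X deduced from it
        typedef typename Sequence::const_iterator iterator;   // immutable: only const access

        // Construction checks the invariants once; afterwards every algorithm
        // may assume "valid UTF-X, normalized to NFC".
        explicit ustring(Sequence seq) : seq_(std::move(seq)) {
            if (!is_valid_utf(seq_) || !is_nfc(seq_))
                throw std::invalid_argument("not valid NFC-normalized UTF data");
        }

        iterator begin() const { return seq_.begin(); }
        iterator end()   const { return seq_.end(); }

        // operator+ would return a lazy join expression rather than copying; omitted here.

    private:
        Sequence seq_;   // underlying storage is whatever Forward Sequence the user chose
    };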

3. It allows using std::string under the hood as storage, giving high efficiency when assigning boost::string to std::string when the implementation is COW (almost all implementations with the exception of MSVC)
COW implementations of std::string are not allowed anymore starting with C++0x.
Shame. I still have a little hope that N2668 will be reverted.
4. It is fully Unicode aware
5. It pushes the "UTF-8" idea into standard C++
6. You don't pay for what you do not need.
What am I paying for? I don't see how I gain anything.
You don't pay for validation of the UTF-8, especially when 99% of uses of the string are encoding-agnostic.
#ifdef C++0x
typedef char32_t const_code_point_type;
#else
typedef unsigned const_code_point_type;
#endif
Just define boost::char32 once (depending on BOOST_NO_CHAR32_T) and use that instead of putting ifdefs everywhere. (that's what boost/cuchar.hpp does in my library)
Good point
// UTF validation
bool is_valid_utf() const;
See, that's what makes the whole thing pointless.
Actually not; consider:

    socket.read(my_string);
    if(!my_string.is_valid_utf())
        ....
Your type doesn't add any semantic value on top of std::string, it's just an agglomeration of free functions into a class. That's a terrible design. The only advantage that a specific type for unicode strings would bring is that it could enforce certain useful invariants.
You don't need to enforce things you don't care about in 99% of cases.
Enforcing that the string is in a valid UTF encoding and is normalized in a specific normalization form can make most Unicode algorithms several orders of magnitude faster.
You do not always want to normalize text. It is the user's choice; you may have optimized algorithms for already-normalized strings, but that is not always the case. Also, what kind of normalization? NFC? NFKC?
All of this is trivial to implement quickly with my Unicode library.
No, it is not.

Your Unicode library is locale-agnostic, which makes it quite useless in too many cases. Almost every added function was locale-sensitive:

- search
- collation
- case handling

And so on. This is a major drawback of your library: it is not capable of doing the locale-sensitive algorithms that are the vast majority of Unicode algorithms.

Artyom

On 28/01/2011 14:58, Artyom wrote:
What am I paying for? I don't see how I gain anything.
You don't pay for validation of the UTF-8, especially when 99% of uses of the string are encoding-agnostic.
I asked for what I gained, not what I did not lose.
// UTF validation
bool is_valid_utf() const;
See, that's what makes the whole thing pointless.
Actually not, consider:
socket.read(my_string); if(!my_string.is_valid_utf()) ....
Could be a free function, and would actually be *better* as a free function, because you could apply it on any range, not just your type.
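For instance, a validity check written as a free function over an arbitrary byte range might look like the following. This is a hand-rolled sketch; a real implementation would sit with the rest of the Unicode algorithms and would likely be table-driven.

    // Works on any iterator range over bytes, not just one string type.
    template<typename ByteIterator>
    bool is_valid_utf8(ByteIterator first, ByteIterator last)
    {
        while (first != last) {
            unsigned char b = static_cast<unsigned char>(*first++);
            int trailing;
            unsigned long cp;

            if (b < 0x80)                { continue; }                   // ASCII
            else if ((b & 0xE0) == 0xC0) { trailing = 1; cp = b & 0x1F; }
            else if ((b & 0xF0) == 0xE0) { trailing = 2; cp = b & 0x0F; }
            else if ((b & 0xF8) == 0xF0) { trailing = 3; cp = b & 0x07; }
            else                         { return false; }               // stray continuation / invalid lead

            for (int i = 0; i < trailing; ++i) {
                if (first == last) return false;                          // truncated sequence
                unsigned char c = static_cast<unsigned char>(*first++);
                if ((c & 0xC0) != 0x80) return false;                     // not a continuation byte
                cp = (cp << 6) | (c & 0x3F);
            }

            // reject overlong encodings, UTF-16 surrogates and out-of-range values
            static const unsigned long min_cp[4] = { 0, 0x80, 0x800, 0x10000 };
            if (cp < min_cp[trailing]) return false;
            if (cp >= 0xD800 && cp <= 0xDFFF) return false;
            if (cp > 0x10FFFF) return false;
        }
        return true;
    }

    // usable on std::string, std::vector<char>, char arrays, memory-mapped data...
    // bool ok = is_valid_utf8(my_string.begin(), my_string.end());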
Your type doesn't add any semantic value on top of std::string, it's just an agglomeration of free functions into a class. That's a terrible design. The only advantage that a specific type for unicode strings would bring is that it could enforce certain useful invariants.
You don't need to enforce things you don't care about in 99% of cases.
You don't get the point. Your type doesn't add any information on top of std::string. Therefore it is meaningless. It's just an agglomeration of functions, in C++ we use namespaces for that, not classes.
Enforcing that the string is in a valid UTF encoding and is normalized in a specific normalization form can make most Unicode algorithms several orders of magnitude faster.
You do not always want to normalize text. It is the user's choice; you may have optimized algorithms for already-normalized strings, but that is not always the case.
If my strings are valid and normalized, I can compare them with a simple binary-level comparison; likewise for substring search, where I may also need to add a boundary check if I want fine-grain search. What you want to do is implement comparison by iterating through each lazily computed code point and comparing them. This is at least 60 times as slow; it also doesn't really compare equivalent characters in the strings.

To get correct behaviour when comparing strings, they should be normalized. Normalization is costly, so you don't want to do it at each comparison, but only once. In practice, all data available everywhere should already be in NFC (XML mandates it, for example), and checking whether a string is normalized is very fast (though less fast than checking whether a string is valid UTF-8, since you still need to access a table, which might hurt the cache, and it is not vectorizable).

Dealing with potentially invalid UTF strings can be highly dangerous as well; exploits for that kind of thing are commonplace. I suspect denormalized Unicode could be sensitive too, since in some parts of your application 00e0 (à) and 0061 0300 (a + `) could compare equal but not in others, depending on what that string went through, causing inconsistencies.

Anyway, the only value we can bring on top of the range abstraction is by establishing invariants. It makes sense to establish the strongest one, though I am not opposed to just checking for UTF validity. But no checking at all? There is no point. You might as well make your string type a typedef of std::string.
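The pattern being argued for, in sketch form: establish the invariant once at construction, then comparisons are plain byte comparisons. normalize_nfc below is a stand-in for whatever normalization routine is actually used (ICU, Boost.Locale, ...), not a real function.

    #include <string>

    std::string normalize_nfc(std::string const &utf8);   // hypothetical normalization routine

    class nfc_string {
    public:
        explicit nfc_string(std::string const &raw)
            : data_(normalize_nfc(raw)) {}                 // pay the normalization cost once, on entry

        // from here on, equality and ordering are plain byte comparisons,
        // and equivalent strings (e.g. "\u00E0" vs "a\u0300") compare equal
        friend bool operator==(nfc_string const &a, nfc_string const &b) { return a.data_ == b.data_; }
        friend bool operator< (nfc_string const &a, nfc_string const &b) { return a.data_ <  b.data_; }

    private:
        std::string data_;   // invariant: valid UTF-8, normalization form C
    };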
Also what kind of normalization NFC? NFKC?
NFC, of course. It takes less space and doesn't make you lose anything. If you want to work in decomposed forms or something else, use your own container and not the adaptor. Remember, this whole thing is just there to help you deal with the general case in a practical, correct and efficient way. The real algorithms are fully generic, and allow you to do whatever you want; they accept both normalized and un-normalized strings, data regardless of its memory layout, etc.
All of this is trivial to implement quickly with my Unicode library.
No, it is not.
I know better what I described and what my library is capable of, thank you.
Your Unicode library is locale agnostic which makes it quite useless in too many cases.
In the common case, you don't care (nor want to care) about a locale.
Almost every added function was locale sensitive:
- search
- collation
- case handling
And so on. This is a major drawback of your library: it is not capable of doing the locale-sensitive algorithms that are the vast majority of Unicode algorithms.
Search up to the combining character sequence boundary is locale-agnostic. Search up to the grapheme boundary is virtually locale-agnostic (Unicode does not distribute locale alternatives, though it does hint at the possibility).

Case folding only has a couple of characters that are specific to Turkish, making it quite reasonably locale-agnostic.

Collation depends on a special table; Unicode only provides a default one, which aims at being as locale-agnostic as possible. It also hosts a repository where one can get alternative tables.

Anyway, those are mere details; you can always change the backend for one tailored to your locale.

If my strings are valid and normalized, I can compare them with a simple binary-level comparison; likewise for substring search, where I may also need to add a boundary check if I want fine-grain search.
No, you can't. For example, when you search for the word שלום you want to find שָלוֹם as well (with diacritics) that are not normalized. Search and collation require much more complicated level-based comparison.
To get correct behaviour when comparing strings, they should be normalized. Normalization is costly, so you don't want to do it at each comparison, but only once. In practice, all data available everywhere should already be in NFC (XML mandates it, for example) and checking whether a string is normalized is very fast (while less fast than checking if a string is valid UTF-8, since you still need to access a table, which might hurt the cache, and is not vectorizable).
Dealing with potentially invalid UTF strings can be highly dangerous as well, exploits for that kind of thing are common-place. I suspect denormalized Unicode could be sensitive too, since in some parts of your application 00e0 (à) and 0061 0300 (a + `) could compare equal but not in others, depending on what that string went through, causing inconsistencies.
The problem is that I may want 00e0 (à) and 0061 0300 (a + `) and 0061 (a) to be equal for string search as well. I agree that normalization makes things simpler, but in many real-world situations it is just not the case. In any case, I agree that most of the algorithms may and should be external, but sometimes it is just convenient to have them within the object.
Anyway, the only value we can bring on top of the range abstraction is by establishing invariants. It makes sense to establish the strongest one; though I am not opposed to just checking for UTF validity.
There are many things to check; checking for valid UTF is just one of the most basic things to do when you get text from an untrusted source.
All of this is trivial to implement quickly with my Unicode library.
No, it is not.
I know better what I described and what my library is capable of, thank you.
Your Unicode library is locale agnostic which makes it quite useless in too many cases.
In the common case, you don't care (nor want to care) about a locale.
Almost every added function was locale sensitive:
- search
- collation
- case handling
And so on. This is a major drawback of your library: it is not capable of doing the locale-sensitive algorithms that are the vast majority of Unicode algorithms.
Search up to the combining character sequence boundary is locale-agnostic.
I'm talking about several primary collation level that are locale agnostic.
Search up to the grapheme boundary is virtually locale-agnostic (Unicode does not distribute locale alternatives, though it does hint at its possibility)
It provides CLDR, the locale database that has all the tables you need.
Case folding only has a couple of characters that are specific for Turkish, making it quite reasonably locale-agnostic.
If I am not mistaken, case folding is locale-agnostic; it is case mapping that is locale-sensitive. And "quite reasonably locale-agnostic" is not the answer for a Turkish speaker :-) Just as you could say that text is generally LTR, with the small exception of Hebrew, Arabic and Persian... so why care?
Collation depends on a special table; Unicode only provides a default one, which aims at being as locale-agnostic as possible. It also hosts a repository where one can get alternative tables.
Any reasonable Unicode library must use them: ICU uses CLDR, the Windows Unicode API uses CLDR, and CLDR even provides tables for the POSIX API to make it more convenient. So ignoring CLDR is just wrong. CLDR is an integral part of Unicode, just like its algorithms and character property database.
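For what it's worth, even the standard library exposes locale-tailored ordering of this kind through the collate facet. A minimal example follows; the locale name is system-dependent and only illustrative, and std::locale will throw if it is not installed.

    #include <locale>
    #include <string>

    // Locale-sensitive three-way comparison using the standard collate facet.
    int compare_in_locale(std::string const &a, std::string const &b, std::locale const &loc)
    {
        std::collate<char> const &coll = std::use_facet<std::collate<char> >(loc);
        return coll.compare(a.data(), a.data() + a.size(),
                            b.data(), b.data() + b.size());
    }

    int main()
    {
        // In Swedish, 'ä' sorts after 'z'; in German it sorts with 'a'.
        // The same byte sequences therefore order differently per locale,
        // which is exactly the tailoring CLDR-based data provides.
        std::locale swedish("sv_SE.UTF-8");            // throws if the locale is not available
        std::string x = "z", y = "\xC3\xA4";           // "ä" in UTF-8
        int r = compare_in_locale(x, y, swedish);      // expected < 0 under Swedish rules
        (void)r;
    }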
Anyway, those are mere details; you can always change the backend for one tailored to your locale.
I'm rather talking about the concept. I do like the idea of having a full Unicode library in Boost, but it should be done right. There are many non-trivial problems with ICU, but it is still the best library we have around, and a huge amount of work was put into it.

Artyom

On 30/01/2011 08:46, Artyom wrote:
If my strings are valid and normalized, I can compare them with a simple binary-level comparison; likewise for substring search, where I may also need to add a boundary check if I want fine-grain search.
No you can't
For example, when you search for the word שלום you want to find שָלוֹם as well (with diacritics) that are not normalized.
Unless I understand that wrong, they're as equal as e is equal to é or a is equal to à.
Search and Collation require much more complicated levels comparison.
Right, I'm talking about exact comparison, not collation. Exact comparison is what you use in most text processing and parsing. You can perform collation folding with the right level if you want those two strings to compare equal.
The problem is that I may want 00e0 (à) and 0061 0300 (a + `) and 0061 (a) to be equal for string search as well.
You may, but that should not be the default behaviour of operator== and operator<.

From: Mathias Gaunard <mathias.gaunard@ens-lyon.org> Subject: Re: [boost] [string] Realistic API proposal
On 30/01/2011 08:46, Artyom wrote:
If my strings are valid and normalized, I can compare them with a simple binary-level comparison; likewise for substring search, where I may also need to add a boundary check if I want fine-grain search.
No you can't
For example, when you search for the word שלום you want to find שָלוֹם as well (with diacritics) that are not normalized.
Unless I understand that wrong, they're as equal as e is equal to é or a is equal to à.
Yes, with the small exception that "שָ" in its NFC form consists of two code points, for the "base letter" and the "vowel mark", which should be equal to "ש", the "base letter" alone, unlike "à", which has one code point in NFC form, like "a".
Search and Collation require much more complicated levels comparison.
Right, I'm talking about exact comparison, not collation. Exact comparison is what you use in most text processing and parsing.
You can perform collation folding with the right level if you want those two strings to compare equal.
The problem is that I may want 00e0 (à) and 0061 0300 (a + `) and 0061 (a) to be equal for string search as well.
You may, but that should not be the default behaviour of operator== and operator<.
The default behavior is binary comparison, but that is not what I'm looking for. I'm looking for a search/comparison algorithm that can see "à" and "a", and "שָ" and "ש", as equal.

Artyom

Hi Artyom,

Artyom wrote:
I'd like to provide a realistic string API proposal:
I've been keeping out of this up to now, but since there is something concrete here I'll share my thoughts.
// Fully bidirectional iterator
template<typename UnitsIterator>
class const_code_point_iterator {
public:
    const_code_point_iterator(UnitsIterator begin, UnitsIterator end); // begin
    const_code_point_iterator(UnitsIterator begin, UnitsIterator end, UnitsIterator location); // current pos
    const_code_point_iterator(); // end

    #ifdef C++0x
    typedef char32_t const_code_point_type;
    #else
    typedef unsigned const_code_point_type;
    #endif

    const_code_point_type operator*() const;
    ...
};
I have something broadly like this here: http://svn.chezphil.org/libpbe/trunk/include/charset/const_character_iterato... I attempted to do this with the character set as a template parameter and a "charset traits" class providing encoding and decoding functions. That was probably over-complicated; making it utf-8 only would be fine - but in that case, it should have a name that says "utf8".

I do find it somewhat unsatisfactory that you need to store the begin and end of the underlying string. This triples the size of what could otherwise be a single pointer. I think these are only needed to detect invalid utf-8, aren't they? In some of my code I had an error_policy template parameter that allowed you to specify whether the input should be trusted or not; if it's trusted you can avoid this overhead. Even then, though, you can't avoid having begin and end in the interface, adding verbosity.

Another way to avoid storing begin and end is to somehow make those iterators empty structs (and hence also default-constructible). Specifically, if your underlying string is guaranteed to be null-terminated, the end iterator can be stateless. I guess you could avoid storing the begin iterator by prepending a null, but that doesn't work for std::string.
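To illustrate the "trusted input" variant, here is a minimal decoding iterator that stores only a single pointer, on the assumption that the bytes are already known to be valid UTF-8. It is a sketch, not code from either library; decrement is omitted (a full bidirectional version would scan backwards over continuation bytes), and no end-of-range or error checks are made.

    class trusted_utf8_iterator
    {
    public:
        explicit trusted_utf8_iterator(char const *pos) : p_(pos) {}

        // decode the code point starting at the current position
        char32_t operator*() const
        {
            unsigned char b = static_cast<unsigned char>(*p_);
            if (b < 0x80) return b;
            int n = (b & 0xE0) == 0xC0 ? 1 : (b & 0xF0) == 0xE0 ? 2 : 3;   // number of trailing bytes
            char32_t cp = b & (0x3F >> n);                                 // bits from the lead byte
            for (int i = 1; i <= n; ++i)
                cp = (cp << 6) | (static_cast<unsigned char>(p_[i]) & 0x3F);
            return cp;
        }

        // advance by one whole code point
        trusted_utf8_iterator &operator++()
        {
            unsigned char b = static_cast<unsigned char>(*p_);
            p_ += b < 0x80 ? 1 : (b & 0xE0) == 0xC0 ? 2 : (b & 0xF0) == 0xE0 ? 3 : 4;
            return *this;
        }

        friend bool operator==(trusted_utf8_iterator a, trusted_utf8_iterator b) { return a.p_ == b.p_; }
        friend bool operator!=(trusted_utf8_iterator a, trusted_utf8_iterator b) { return a.p_ != b.p_; }

    private:
        char const *p_;   // a single pointer, not a (begin, end, position) triple
    };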
/// Output iterator
template<typename BackInserter>
class code_point_iterator {
public:
    code_point_iterator(BackInserter out); // begin
    code_point_iterator(); // end

    #ifdef C++0x
    typedef char32_t code_point_type;
    #else
    typedef unsigned code_point_type;
    #endif

    code_point_type operator*() const;
    ...
};
So this only allows appending, right? I have something like that here: http://svn.chezphil.org/libpbe/trunk/include/charset/character_output_iterat... Broadly, I would say that allowing bidirectional reading and append-only writing is the right thing to do for strings. If anyone has an hour to spare, it's educational to try hacking your code to use std::list<char> instead of std::string, and see how much of it still compiles.
template<typename Char,typename Traits=std::char_traits<Char>,typename Alloc=std::allocator<Char> >
class basic_string {
public:
    // { boost specific
    typedef std::basic_string<Char,Traits,Alloc> std_string_type;
    // } boost specific

    // All the standard std::string member functions

    // Deprecated interfaces that exist for backward compatibility,
    // as they are not Unicode aware
    value_type &at(size_type indx);
    value_type &operator[](size_type indx);
    iterator begin();
    iterator end();

    // { boost specific compatibility functions with std::string; they would go away
    // as std::string becomes extended with boost::string's new interfaces
    //
    basic_string(std_string_type const &other) : data_(other) {}
    basic_string(std_string_type const &other,size_type index,size_type len) : data_(other,index,len) {}

    ...

    operator std_string_type() const { return data_; }

    // } boost specific compatibility functions

    //
    // Unicode Support
    // ------------------------
    //

    //
    // UTF code point iteration
    //

    #ifdef C++0x
    typedef char32_t code_point_type;
    #else
    typedef unsigned code_point_type;
    #endif

    typedef boost::const_code_point_iterator<const_iterator> const_code_point_iterator;

    const_code_point_iterator code_point_begin() const
    {
        return const_code_point_iterator(begin(),end());
    }
    const_code_point_iterator code_point_end() const
    {
        return const_code_point_iterator(begin(),end(),end());
    }

    typedef boost::code_point_iterator<std::back_insert_iterator<basic_string> > code_point_iterator;

    code_point_iterator back_inserter()
    {
        return code_point_iterator(std::back_inserter(*this));
    }

    basic_string &operator+=(code_point_type code_point);
    basic_string operator+(code_point_type code_point) const;
    void append(code_point_type code_point);
The approach that I would prefer is more like:

    template <typename impl_t>
    class utf8_string_adaptor {
        impl_t impl;
        ..
    };

    typedef utf8_string_adaptor<std::string> utf8_string;

In this way:

- I can wrap other containers than std::string, e.g. sgi::rope, char*, std::vector etc.
- utf8_string::begin() can return a utf8_character_iterator.
- Accessing the underlying bytes is possible but requires something explicit, e.g. foo.base().begin().
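Client code under that scheme might read roughly as follows; utf8_character_iterator and base() come from the sketch above, not from an existing library, so this is only indicative.

    typedef utf8_string_adaptor<std::string> utf8_string;

    void example(utf8_string const &s)
    {
        // character-level (code point) access by default
        for (utf8_character_iterator it = s.begin(); it != s.end(); ++it) {
            char32_t c = *it;
            // ...
            (void)c;
        }

        // byte-level access is still possible, but has to be asked for explicitly
        std::string::const_iterator raw = s.base().begin();
        (void)raw;
    }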
    //
    // Lexical operations on the string
    //

    // Case handling
    basic_string upper_case(std::locale const &l=std::locale()) const;
    basic_string lower_case(std::locale const &l=std::locale()) const;
    basic_string title_case(std::locale const &l=std::locale()) const;
    basic_string fold_case() const; // locale independent

    // Unicode normalization
    typedef enum { nfc, nfkc, nfd, nfkd } normalization_mode;

    basic_string normalize(normalization_mode mode = nfc) const;

    // normalized string constructors
    basic_string(basic_string const &,normalization_mode mode);
    basic_string(Char const *,normalization_mode mode);
    basic_string(Char const *,size_t n,normalization_mode mode);
    template<typename Iterator>
    basic_string(Iterator begin,Iterator end,normalization_mode mode);

    void append_normalized(basic_string const &other,normalization_mode mode = nfc);
    void append_normalized(Char const *,normalization_mode mode = nfc);
    void append_normalized(Char const *,size_t n,normalization_mode mode = nfc);

    basic_string concat_normalized(basic_string const &other,normalization_mode mode = nfc) const;
    basic_string concat_normalized(Char const *,normalization_mode mode = nfc) const;
    basic_string concat_normalized(Char const *,size_t n,normalization_mode mode = nfc) const;

    // Unicode validation
    bool valid_utf() const;
[snip]

Surely almost all of that should be in free functions and generic algorithms, no? E.g. valid_utf8() could be an algorithm that takes a pair of iterators over bytes, and then it can be used on any sequence.

Regards, Phil.
participants (4)
- Artyom
- Mathias Gaunard
- Phil Endecott
- Sebastian Redl