
// Hello All,
//
// I'd like to provide a realistic string API proposal:
//
// 1. It is a Unicode-aware extension of std::string.
// 2. It is fully compatible with std::string, which gives it the ability to be
//    included in C++XYZ.
// 3. It can be implemented today!
//
// Basically it extends std::string with:
//
// a) const_code_point_iterator - for iterating over the string - bidirectional
// b) code_point_iterator - back inserter
// c) Unicode and locale functionality:
//    - normalization
//    - case handling
//    - comparison
//    - search
// d) non-const operator[] and at() marked as deprecated
//
// Advantages:
// -----------
//
// 1. It can be implemented easily (when Boost.Locale is given).
// 2. It **is** a compatible replacement for std::string.
// 3. It allows std::string to be used under the hood as storage, giving high
//    efficiency when assigning boost::string to std::string when the
//    implementation is COW (almost all implementations, with the exception
//    of MSVC).
// 4. It is fully Unicode aware.
// 5. It pushes the "UTF-8" idea into standard C++.
// 6. You don't pay for what you do not need.
//
// Proposed API:
// -------------

#include <cstddef>
#include <iterator>
#include <locale>
#include <memory>
#include <string>
#include <utility>

namespace boost {

namespace detail {

// Single definition of the code point type, shared by both iterators and by
// basic_string so the three public typedefs can never drift apart.
#if __cplusplus >= 201103L
typedef char32_t utf_code_point;
#else
typedef unsigned utf_code_point;
#endif

} // namespace detail

/// Fully bidirectional iterator that yields code points from a range of
/// code units delimited by a pair of UnitsIterator values.
template<typename UnitsIterator>
class const_code_point_iterator {
public:
    /// Iterator positioned at the start of [begin, end).
    const_code_point_iterator(UnitsIterator begin, UnitsIterator end);
    /// Iterator positioned at `location` inside [begin, end).
    const_code_point_iterator(UnitsIterator begin, UnitsIterator end, UnitsIterator location);
    /// End iterator.
    const_code_point_iterator();

    typedef detail::utf_code_point const_code_point_type;

    const_code_point_type operator*() const;

    // ... remaining iterator interface is elided in the proposal.
};

/// Output iterator that writes code points through a back inserter.
template<typename BackInserter>
class code_point_iterator {
public:
    /// Iterator writing through `out`.
    code_point_iterator(BackInserter out);
    /// End iterator.
    code_point_iterator();

    typedef detail::utf_code_point code_point_type;

    code_point_type operator*() const;

    // ... remaining iterator interface is elided in the proposal.
};

/// Proposed Unicode-aware string that keeps a std::basic_string as storage
/// and converts to and from it.
template<typename Char,
         typename Traits = std::char_traits<Char>,
         typename Alloc = std::allocator<Char> >
class basic_string {
public:
    // { boost specific
    typedef std::basic_string<Char, Traits, Alloc> std_string_type;
    // } boost specific

    // Member types forwarded from the wrapped std::basic_string; the
    // declarations below rely on them.
    typedef typename std_string_type::value_type     value_type;
    typedef typename std_string_type::size_type      size_type;
    typedef typename std_string_type::iterator       iterator;
    typedef typename std_string_type::const_iterator const_iterator;

    // All standard std::string functions go here (elided in the proposal).

    // Read-only code unit access; code_point_begin()/code_point_end() are
    // const members and need these overloads.
    const_iterator begin() const { return data_.begin(); }
    const_iterator end() const { return data_.end(); }

    // Deprecated interfaces that exist for backward compatibility,
    // as they are not Unicode aware.
    value_type &at(size_type indx);
    value_type &operator[](size_type indx);
    iterator begin();
    iterator end();

    // { boost specific compatibility functions with std::string; they would go
    //   away once std::string becomes extended with the new boost::string
    //   interfaces
    //
    basic_string(std_string_type const &other) : data_(other) {}
    basic_string(std_string_type const &other, size_type index, size_type len)
        : data_(other, index, len)
    {
    }
    // ... further compatibility functions are elided in the proposal.
    operator std_string_type() const { return data_; }
    // } boost specific compatibility functions

    //
    // Unicode Support
    //
    // ------------------------
    //

    //
    // UTF Codepoint iteration
    //
    typedef detail::utf_code_point code_point_type;

    typedef boost::const_code_point_iterator<const_iterator> const_code_point_iterator;

    /// Code point iterator positioned at the first code unit.
    const_code_point_iterator code_point_begin() const
    {
        return const_code_point_iterator(begin(), end());
    }

    /// Code point iterator positioned one past the last code unit.
    const_code_point_iterator code_point_end() const
    {
        return const_code_point_iterator(begin(), end(), end());
    }

    // std::back_inserter is a function template; the iterator *type* it
    // returns is std::back_insert_iterator.
    typedef boost::code_point_iterator<std::back_insert_iterator<basic_string> > code_point_iterator;

    /// Output iterator that appends code points to this string.
    code_point_iterator back_inserter()
    {
        return code_point_iterator(std::back_inserter(*this));
    }

    basic_string &operator+=(code_point_type code_point);
    basic_string operator+(code_point_type code_point) const;
    void append(code_point_type code_point);

    //
    // Lexical operations on string
    //

    // Case handling
    basic_string upper_case(std::locale const &l = std::locale()) const;
    basic_string lower_case(std::locale const &l = std::locale()) const;
    basic_string title_case(std::locale const &l = std::locale()) const;
    basic_string fold_case() const; // locale independent

    // Unicode normalization
    typedef enum { nfc, nfkc, nfd, nfkd } normalization_mode;

    basic_string normalize(normalization_mode mode = nfc) const;

    // Normalized string constructors
    basic_string(basic_string const &, normalization_mode mode);
    basic_string(Char const *, normalization_mode mode);
    basic_string(Char const *, std::size_t n, normalization_mode mode);
    template<typename Iterator>
    basic_string(Iterator begin, Iterator end, normalization_mode mode);

    void append_normalized(basic_string const &other, normalization_mode mode = nfc);
    void append_normalized(Char const *, normalization_mode mode = nfc);
    void append_normalized(Char const *, std::size_t n, normalization_mode mode = nfc);

    basic_string concat_normalized(basic_string const &other, normalization_mode mode = nfc) const;
    basic_string concat_normalized(Char const *, normalization_mode mode = nfc) const;
    basic_string concat_normalized(Char const *, std::size_t n, normalization_mode mode = nfc) const;

    // Unicode validation
    bool valid_utf() const;

    struct validate_utf {}; // Unicode validation tag

    // Create string, validating it
    basic_string(basic_string const &, validate_utf const &);
    basic_string(Char const *, validate_utf const &);
    basic_string(Char const *, std::size_t n, validate_utf const &);
    template<typename Iterator>
    basic_string(Iterator begin, Iterator end, validate_utf const &);

    // Create string, validating and normalizing it
    basic_string(basic_string const &, validate_utf const &, normalization_mode mode);
    basic_string(Char const *, validate_utf const &, normalization_mode mode);
    basic_string(Char const *, std::size_t n, validate_utf const &, normalization_mode mode);
    template<typename Iterator>
    basic_string(Iterator begin, Iterator end, validate_utf const &, normalization_mode mode);

    // Search and comparison
    typedef enum {
        primary    = 0, ///< 1st collation level: base letters
        secondary  = 1, ///< 2nd collation level: letters and accents
        tertiary   = 2, ///< 3rd collation level: letters, accents and case
        quaternary = 3, ///< 4th collation level: letters, accents, case and punctuation
        identical  = 4  ///< identical collation level: include code-point comparison
    } level_type;

    //
    // search(...) returns a pair of index and size, as the string you search
    // for may have a different size
    //
    std::pair<size_type, size_type> search(basic_string const &other) const;
    std::pair<size_type, size_type> search(basic_string const &other, level_type level) const;
    std::pair<size_type, size_type> search(basic_string const &other, std::locale const &l) const;
    std::pair<size_type, size_type> search(basic_string const &other, level_type level, std::locale const &l) const;

    std::pair<size_type, size_type> search(basic_string const &other, std::size_t index, std::size_t size) const;
    std::pair<size_type, size_type> search(basic_string const &other, std::size_t index, std::size_t size, level_type level) const;
    std::pair<size_type, size_type> search(basic_string const &other, std::size_t index, std::size_t size, std::locale const &l) const;
    std::pair<size_type, size_type> search(basic_string const &other, std::size_t index, std::size_t size, level_type level, std::locale const &l) const;

    std::pair<size_type, size_type> search(Char const *) const;
    std::pair<size_type, size_type> search(Char const *, level_type level) const;
    std::pair<size_type, size_type> search(Char const *, std::locale const &l) const;
    std::pair<size_type, size_type> search(Char const *, level_type level, std::locale const &l) const;

    std::pair<size_type, size_type> search(Char const *, std::size_t size) const;
    std::pair<size_type, size_type> search(Char const *, std::size_t size, level_type level) const;
    std::pair<size_type, size_type> search(Char const *, std::size_t size, std::locale const &l) const;
    std::pair<size_type, size_type> search(Char const *, std::size_t size, level_type level, std::locale const &l) const;

    int compare_to(basic_string const &other) const;
    int compare_to(basic_string const &other, level_type level) const;
    int compare_to(basic_string const &other, std::locale const &l) const;
    int compare_to(basic_string const &other, level_type level, std::locale const &l) const;

    int compare_to(basic_string const &other, std::size_t index, std::size_t size) const;
    int compare_to(basic_string const &other, std::size_t index, std::size_t size, level_type level) const;
    int compare_to(basic_string const &other, std::size_t index, std::size_t size, std::locale const &l) const;
    int compare_to(basic_string const &other, std::size_t index, std::size_t size, level_type level, std::locale const &l) const;

    int compare_to(std::size_t index, std::size_t size, basic_string const &other) const;
    int compare_to(std::size_t index, std::size_t size, basic_string const &other, level_type level) const;
    int compare_to(std::size_t index, std::size_t size, basic_string const &other, std::locale const &l) const;
    int compare_to(std::size_t index, std::size_t size, basic_string const &other, level_type level, std::locale const &l) const;

    // The second index/size pair is spelled other_index/other_size: the
    // proposal reused `index` and `size` twice in one parameter list, which
    // is ill-formed. Presumably the second pair refers to `other` - TODO confirm.
    int compare_to(std::size_t index, std::size_t size, basic_string const &other, std::size_t other_index, std::size_t other_size) const;
    int compare_to(std::size_t index, std::size_t size, basic_string const &other, std::size_t other_index, std::size_t other_size, level_type level) const;
    int compare_to(std::size_t index, std::size_t size, basic_string const &other, std::size_t other_index, std::size_t other_size, std::locale const &l) const;
    int compare_to(std::size_t index, std::size_t size, basic_string const &other, std::size_t other_index, std::size_t other_size, level_type level, std::locale const &l) const;

    int compare_to(Char const *) const;
    int compare_to(Char const *, level_type level) const;
    int compare_to(Char const *, std::locale const &l) const;
    int compare_to(Char const *, level_type level, std::locale const &l) const;

    int compare_to(Char const *, std::size_t size) const;
    int compare_to(Char const *, std::size_t size, level_type level) const;
    int compare_to(Char const *, std::size_t size, std::locale const &l) const;
    int compare_to(Char const *, std::size_t size, level_type level, std::locale const &l) const;

    // NOTE(review): unlike the basic_string overloads above, this family takes
    // a single leading size rather than an index/size pair - confirm whether
    // an index parameter was intended.
    int compare_to(std::size_t size, Char const *) const;
    int compare_to(std::size_t size, Char const *, level_type level) const;
    int compare_to(std::size_t size, Char const *, std::locale const &l) const;
    int compare_to(std::size_t size, Char const *, level_type level, std::locale const &l) const;

    // Trailing size renamed to other_size for the same duplicate-name reason.
    int compare_to(std::size_t size, Char const *, std::size_t other_size) const;
    int compare_to(std::size_t size, Char const *, std::size_t other_size, level_type level) const;
    int compare_to(std::size_t size, Char const *, std::size_t other_size, std::locale const &l) const;
    int compare_to(std::size_t size, Char const *, std::size_t other_size, level_type level, std::locale const &l) const;

    // UTF validation
    // NOTE(review): the proposal declares both valid_utf() and is_valid_utf();
    // both are kept so neither spelling breaks.
    bool is_valid_utf() const;

private:
    std_string_type data_; // underlying std::basic_string storage
};

} // boost