[string] Realistic API proposal

Hello All,

I'd like to provide a realistic string API proposal:

1. It is a Unicode-aware extension of std::string.
2. It is fully compatible with std::string, giving it the ability to be included in C++XYZ.
3. It can be implemented today!

Basically it extends std::string with:

a) const_code_point_iterator - for iterating over the string - bidirectional
b) code_point_iterator - a back inserter
c) Unicode and locale functionality:
   - normalization
   - case handling
   - comparison
   - search
d) marking the non-const operator[] and at() as deprecated

Advantages:
-----------

1. It can be implemented easily (when Boost.Locale is given)
2. It **is** a compatible replacement for std::string
3. It allows using std::string under the hood as storage, giving high efficiency when assigning boost::string to std::string when the implementation is COW (almost all implementations with the exception of MSVC)
4. It is fully Unicode aware
5. It pushes the "UTF-8" idea into standard C++
6. You don't pay for what you do not need.

Proposed API:
-------------

namespace boost {

    // Fully bidirectional iterator
    template<typename UnitsIterator>
    class const_code_point_iterator {
    public:
        const_code_point_iterator(UnitsIterator begin,UnitsIterator end); // begin
        const_code_point_iterator(UnitsIterator begin,UnitsIterator end,UnitsIterator location); // current pos
        const_code_point_iterator(); // end

        #ifdef C++0x
        typedef char32_t const_code_point_type;
        #else
        typedef unsigned const_code_point_type;
        #endif

        const_code_point_type operator*() const;
        ...
    };

    /// Output iterator
    template<typename BackInserter>
    class code_point_iterator {
    public:
        code_point_iterator(BackInserter out); // begin
        code_point_iterator(); // end

        #ifdef C++0x
        typedef char32_t code_point_type;
        #else
        typedef unsigned code_point_type;
        #endif

        code_point_type operator*() const;
        ...
    };

    template<typename Char,typename Traits=std::char_traits<Char>,typename Alloc=std::allocator<Char> >
    class basic_string {
    public:
        // { boost specific
        typedef std::basic_string<Char,Traits,Alloc> std_string_type;
        // } boost specific

        // All the standard std::string member functions

        // Deprecated interfaces that exist for backward compatibility,
        // as they are not Unicode aware
        value_type &at(size_type indx);
        value_type &operator[](size_type indx);
        iterator begin();
        iterator end();

        // { boost specific compatibility functions with std::string; they would go away
        // as std::string becomes extended with boost::string's new interfaces
        //
        basic_string(std_string_type const &other) : data_(other) {}
        basic_string(std_string_type const &other,size_type index,size_type len) : data_(other,index,len) {}

        ...

        operator std_string_type() const { return data_; }

        // } boost specific compatibility functions

        //
        // Unicode Support
        // ------------------------
        //

        //
        // UTF code point iteration
        //

        #ifdef C++0x
        typedef char32_t code_point_type;
        #else
        typedef unsigned code_point_type;
        #endif

        typedef boost::const_code_point_iterator<const_iterator> const_code_point_iterator;

        const_code_point_iterator code_point_begin() const
        {
            return const_code_point_iterator(begin(),end());
        }
        const_code_point_iterator code_point_end() const
        {
            return const_code_point_iterator(begin(),end(),end());
        }

        typedef boost::code_point_iterator<std::back_insert_iterator<basic_string> > code_point_iterator;

        code_point_iterator back_inserter()
        {
            return code_point_iterator(std::back_inserter(*this));
        }

        basic_string &operator+=(code_point_type code_point);
        basic_string operator+(code_point_type code_point) const;
        void append(code_point_type code_point);

        //
        // Lexical operations on the string
        //

        // Case handling
        basic_string upper_case(std::locale const &l=std::locale()) const;
        basic_string lower_case(std::locale const &l=std::locale()) const;
        basic_string title_case(std::locale const &l=std::locale()) const;
        basic_string fold_case() const; // locale independent

        // Unicode normalization
        typedef enum { nfc, nfkc, nfd, nfkd } normalization_mode;

        basic_string normalize(normalization_mode mode = nfc) const;

        // normalized string constructors
        basic_string(basic_string const &,normalization_mode mode);
        basic_string(Char const *,normalization_mode mode);
        basic_string(Char const *,size_t n,normalization_mode mode);
        template<typename Iterator>
        basic_string(Iterator begin,Iterator end,normalization_mode mode);

        void append_normalized(basic_string const &other,normalization_mode mode = nfc);
        void append_normalized(Char const *,normalization_mode mode = nfc);
        void append_normalized(Char const *,size_t n,normalization_mode mode = nfc);

        basic_string concat_normalized(basic_string const &other,normalization_mode mode = nfc) const;
        basic_string concat_normalized(Char const *,normalization_mode mode = nfc) const;
        basic_string concat_normalized(Char const *,size_t n,normalization_mode mode = nfc) const;

        // Unicode validation
        bool valid_utf() const;

        struct validate_utf {}; // Unicode validation tag

        // Create a string, validating it
        basic_string(basic_string const &,validate_utf const &);
        basic_string(Char const *,validate_utf const &);
        basic_string(Char const *,size_t n,validate_utf const &);
        template<typename Iterator>
        basic_string(Iterator begin,Iterator end,validate_utf const &);

        // Create a string, validating and normalizing it
        basic_string(basic_string const &,validate_utf const &,normalization_mode mode);
        basic_string(Char const *,validate_utf const &,normalization_mode mode);
        basic_string(Char const *,size_t n,validate_utf const &,normalization_mode mode);
        template<typename Iterator>
        basic_string(Iterator begin,Iterator end,validate_utf const &,normalization_mode mode);

        // Search and comparison
        typedef enum {
            primary    = 0, ///< 1st collation level: base letters
            secondary  = 1, ///< 2nd collation level: letters and accents
            tertiary   = 2, ///< 3rd collation level: letters, accents and case
            quaternary = 3, ///< 4th collation level: letters, accents, case and punctuation
            identical  = 4  ///< identical collation level: includes code-point comparison
        } level_type;

        //
        // search(...) returns a pair of (index, size), since the matched text
        // may have a different size from the string you search for
        //
        std::pair<size_type,size_type> search(basic_string const &other) const;
        std::pair<size_type,size_type> search(basic_string const &other,level_type level) const;
        std::pair<size_type,size_type> search(basic_string const &other,std::locale const &l) const;
        std::pair<size_type,size_type> search(basic_string const &other,level_type level,std::locale const &l) const;
        std::pair<size_type,size_type> search(basic_string const &other,size_t index,size_t size) const;
        std::pair<size_type,size_type> search(basic_string const &other,size_t index,size_t size,level_type level) const;
        std::pair<size_type,size_type> search(basic_string const &other,size_t index,size_t size,std::locale const &l) const;
        std::pair<size_type,size_type> search(basic_string const &other,size_t index,size_t size,level_type level,std::locale const &l) const;

        std::pair<size_type,size_type> search(Char const *) const;
        std::pair<size_type,size_type> search(Char const *,level_type level) const;
        std::pair<size_type,size_type> search(Char const *,std::locale const &l) const;
        std::pair<size_type,size_type> search(Char const *,level_type level,std::locale const &l) const;
        std::pair<size_type,size_type> search(Char const *,size_t size) const;
        std::pair<size_type,size_type> search(Char const *,size_t size,level_type level) const;
        std::pair<size_type,size_type> search(Char const *,size_t size,std::locale const &l) const;
        std::pair<size_type,size_type> search(Char const *,size_t size,level_type level,std::locale const &l) const;

        int compare_to(basic_string const &other) const;
        int compare_to(basic_string const &other,level_type level) const;
        int compare_to(basic_string const &other,std::locale const &l) const;
        int compare_to(basic_string const &other,level_type level,std::locale const &l) const;
        int compare_to(basic_string const &other,size_t index,size_t size) const;
        int compare_to(basic_string const &other,size_t index,size_t size,level_type level) const;
        int compare_to(basic_string const &other,size_t index,size_t size,std::locale const &l) const;
        int compare_to(basic_string const &other,size_t index,size_t size,level_type level,std::locale const &l) const;

        int compare_to(size_t index,size_t size,basic_string const &other) const;
        int compare_to(size_t index,size_t size,basic_string const &other,level_type level) const;
        int compare_to(size_t index,size_t size,basic_string const &other,std::locale const &l) const;
        int compare_to(size_t index,size_t size,basic_string const &other,level_type level,std::locale const &l) const;
        int compare_to(size_t index,size_t size,basic_string const &other,size_t other_index,size_t other_size) const;
        int compare_to(size_t index,size_t size,basic_string const &other,size_t other_index,size_t other_size,level_type level) const;
        int compare_to(size_t index,size_t size,basic_string const &other,size_t other_index,size_t other_size,std::locale const &l) const;
        int compare_to(size_t index,size_t size,basic_string const &other,size_t other_index,size_t other_size,level_type level,std::locale const &l) const;

        int compare_to(Char const *) const;
        int compare_to(Char const *,level_type level) const;
        int compare_to(Char const *,std::locale const &l) const;
        int compare_to(Char const *,level_type level,std::locale const &l) const;
        int compare_to(Char const *,size_t size) const;
        int compare_to(Char const *,size_t size,level_type level) const;
        int compare_to(Char const *,size_t size,std::locale const &l) const;
        int compare_to(Char const *,size_t size,level_type level,std::locale const &l) const;

        int compare_to(size_t size,Char const *) const;
        int compare_to(size_t size,Char const *,level_type level) const;
        int compare_to(size_t size,Char const *,std::locale const &l) const;
        int compare_to(size_t size,Char const *,level_type level,std::locale const &l) const;
        int compare_to(size_t size,Char const *,size_t other_size) const;
        int compare_to(size_t size,Char const *,size_t other_size,level_type level) const;
        int compare_to(size_t size,Char const *,size_t other_size,std::locale const &l) const;
        int compare_to(size_t size,Char const *,size_t other_size,level_type level,std::locale const &l) const;

        // UTF validation
        bool is_valid_utf() const;

    private:
        std_string_type data_;
    };

} // boost
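To make the intent concrete, here is a rough sketch of how client code might look if this interface existed. None of it compiles today: the header name is invented, and the calls are taken directly from the declarations above, so treat it only as an illustration of the proposal, not as working code.

    #include <iostream>
    #include <string>
    #include <utility>
    // #include <boost/string.hpp>   // hypothetical header for the proposed class

    int main()
    {
        std::string raw = "Gr\xC3\xBC\xC3\x9F Gott";          // "Grüß Gott" encoded as UTF-8
        boost::basic_string<char> s(raw);                      // compatibility constructor from std::string

        if(!s.valid_utf())                                     // reject broken input from an untrusted source
            return 1;

        boost::basic_string<char> upper = s.upper_case();      // locale-sensitive case mapping
        boost::basic_string<char> nfc   = s.normalize();       // NFC by default

        // code point iteration instead of code unit iteration
        for(boost::basic_string<char>::const_code_point_iterator it = s.code_point_begin();
            it != s.code_point_end(); ++it)
        {
            std::cout << std::hex << static_cast<unsigned long>(*it) << ' ';
        }

        // collation-aware search at the primary level: finds "Gott" when searching for "gott"
        std::pair<boost::basic_string<char>::size_type,
                  boost::basic_string<char>::size_type> hit =
            s.search("gott", boost::basic_string<char>::primary);

        std::string back = nfc;                                // implicit conversion back to std::string
        (void)hit; (void)back; (void)upper;
        return 0;
    }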

On 28.01.2011 11:41, Artyom wrote:
2. It is fully compatible with std::string, giving it the ability to be included in C++XYZ.

Actually, I think changing std::string from being "whatever the user put into it" to "UTF-8, definitely" is not a compatible change. Worse, it's a runtime-breaking change, not a compile-time-breaking change.

Inspecting your proposed interface, I think you mean for std::string to be any data, with the additional functionality only working if is_valid_utf() is true. That sounds rather dangerous in usage, though, since such a requirement is hard to remember. ("Now which member functions can I call on all strings, and which only on valid UTF?")
Advantages: -----------
3. It allows using std::string under the hood as storage, giving high efficiency when assigning boost::string to std::string when the implementation is COW (almost all implementations with the exception of MSVC)

I'm sorry, but COW is on its way out. Conforming C++0x strings are forbidden to use it.
Sebastian

On 28/01/2011 11:41, Artyom wrote:
b) code_point_iterator - back inserter
You could simply define a push_back(char32_t) and have it naturally be called by std::back_inserter.
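A quick illustration of that point. The wrapper type u8string below is made up for the example; the point is only that once a container exposes push_back(char32_t) and a matching value_type, std::back_inserter works unchanged and no dedicated output-iterator class is needed.

    #include <algorithm>
    #include <iterator>
    #include <string>
    #include <vector>

    struct u8string {
        typedef char32_t value_type;     // what back_insert_iterator will pass to push_back
        std::string bytes;

        // append one code point as UTF-8 (assumes cp is a valid scalar value)
        void push_back(char32_t cp) {
            if (cp < 0x80) {
                bytes += static_cast<char>(cp);
            } else if (cp < 0x800) {
                bytes += static_cast<char>(0xC0 | (cp >> 6));
                bytes += static_cast<char>(0x80 | (cp & 0x3F));
            } else if (cp < 0x10000) {
                bytes += static_cast<char>(0xE0 | (cp >> 12));
                bytes += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
                bytes += static_cast<char>(0x80 | (cp & 0x3F));
            } else {
                bytes += static_cast<char>(0xF0 | (cp >> 18));
                bytes += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
                bytes += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
                bytes += static_cast<char>(0x80 | (cp & 0x3F));
            }
        }
    };

    int main() {
        std::vector<char32_t> cps = { 0x48, 0x65, 0x5D0, 0x1F600 };   // H, e, א, 😀
        u8string s;
        // back_inserter simply calls push_back; no custom code_point_iterator needed
        std::copy(cps.begin(), cps.end(), std::back_inserter(s));
    }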
3. It allows using std::string under the hood as storage, giving high efficiency when assigning boost::string to std::string when the implementation is COW (almost all implementations with the exception of MSVC)
COW implementations of std::string are not allowed anymore starting with C++0x.
4. It is fully Unicode aware
5. It pushes the "UTF-8" idea into standard C++
6. You don't pay for what you do not need.
What am I paying for? I don't see how I gain anything.
Proposed API: -------------
namespace boost {
// Fully bidirectional iterator
template<typename UnitsIterator>
class const_code_point_iterator {
public:
    const_code_point_iterator(UnitsIterator begin,UnitsIterator end); // begin
    const_code_point_iterator(UnitsIterator begin,UnitsIterator end,UnitsIterator location); // current pos
    const_code_point_iterator(); // end

    #ifdef C++0x
    typedef char32_t const_code_point_type;
    #else
    typedef unsigned const_code_point_type;
    #endif
Just define boost::char32 once (depending on BOOST_NO_CHAR32_T) and use that instead of putting ifdefs everywhere. (that's what boost/cuchar.hpp does in my library)
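For reference, a sketch of what that single definition might look like. The real boost/cuchar.hpp may well differ in detail; BOOST_NO_CHAR32_T is the Boost.Config macro that reports whether the native type is available.

    #include <boost/config.hpp>
    #include <boost/cstdint.hpp>

    namespace boost {
    #ifdef BOOST_NO_CHAR32_T
        typedef boost::uint_least32_t char32;   // fallback on compilers without char32_t
    #else
        typedef char32_t char32;                // native C++0x type
    #endif
    }

    // user code then just writes boost::char32 everywhere, with no per-use #ifdef blocks:
    // typedef boost::char32 code_point_type;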
// UTF validation
bool is_valid_utf() const;
See, that's what makes the whole thing pointless. Your type doesn't add any semantic value on top of std::string, it's just an agglomeration of free functions into a class. That's a terrible design.

The only advantage that a specific type for Unicode strings would bring is that it could enforce certain useful invariants. But your proposal doesn't even enforce that the string is valid UTF-8. Enforcing that the string is in a valid UTF encoding and is normalized in a specific normalization form can make most Unicode algorithms several orders of magnitude faster.

Since people seem to want this, here is a simple proposal:

    template<typename T>
    struct ustring;

where T must be a Forward Sequence of char, char16, char32 or wchar_t. The type then acts as an adaptor over that sequence but enforces that the data is encoded in UTF-X in normalization form C, with X deduced from the value type of the inner Forward Sequence.

ustring would be an immutable range of code units, with whatever refinements (bidirectional or random access) the inner Forward Sequence allows. I thought it was accepted that strings should be immutable. Otherwise, insertions at the front/back could be added if the underlying forward sequence allows them. Its operator+ would return a lazy join expression.

And that's all there is to it. Use free functions for the rest; ustring could provide some member helpers if that really makes life easier for some people. All of this is trivial to implement quickly with my Unicode library.
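An interface skeleton of that idea, to make the invariant explicit. This is only an illustration, not the library's actual code; is_valid_utf and is_nfc stand in for whatever validation and normalization-check routines the backend provides.

    #include <stdexcept>
    #include <utility>

    template<typename Sequence> bool is_valid_utf(Sequence const &); // hypothetical helper
    template<typename Sequence> bool is_nfc(Sequence const &);       // hypothetical helper

    template<typename Sequence>          // Forward Sequence of char, char16, char32 or wchar_t
    class ustring {
    public:
        typedef typename Sequence::value_type     value_type; // code unit type; UTF-X deduced from it
        typedef typename Sequence::const_iterator iterator;   // immutable: only const access

        // Construction checks the invariants once; afterwards every algorithm
        // may assume "valid UTF-X, normalized to NFC".
        explicit ustring(Sequence seq) : seq_(std::move(seq)) {
            if (!is_valid_utf(seq_) || !is_nfc(seq_))
                throw std::invalid_argument("not valid NFC-normalized UTF data");
        }

        iterator begin() const { return seq_.begin(); }
        iterator end()   const { return seq_.end(); }

        // operator+ would return a lazy join expression rather than copying; omitted here.

    private:
        Sequence seq_;   // underlying storage is whatever Forward Sequence the user chose
    };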

3. It allows using std::string under the hood as storage, giving high efficiency when assigning boost::string to std::string when the implementation is COW (almost all implementations with the exception of MSVC)
COW implementations of std::string are not allowed anymore starting with C++0x.
Shame. I still have a little hope that N2668 will be reverted.
4. It is fully Unicode aware
5. It pushes the "UTF-8" idea into standard C++
6. You don't pay for what you do not need.
What am I paying for? I don't see how I gain anything.
You don't pay for validation of the UTF-8, especially when 99% of uses of the string are encoding-agnostic.
#ifdef C++0x
typedef char32_t const_code_point_type;
#else
typedef unsigned const_code_point_type;
#endif
Just define boost::char32 once (depending on BOOST_NO_CHAR32_T) and use that instead of putting ifdefs everywhere. (that's what boost/cuchar.hpp does in my library)
Good point
// UTF validation
bool is_valid_utf() const;
See, that's what makes the whole thing pointless.
Actually not; consider:

    socket.read(my_string);
    if(!my_string.is_valid_utf())
        ....
Your type doesn't add any semantic value on top of std::string, it's just an agglomeration of free functions into a class. That's a terrible design. The only advantage that a specific type for unicode strings would bring is that it could enforce certain useful invariants.
You don't need to enforce things you don't care about in 99% of cases.
Enforcing that the string is in a valid UTF encoding and is normalized in a specific normalization form can make most Unicode algorithms several orders of magnitude faster.
You do not always want to normalize text. It is the user's choice; you may have optimized algorithms for already-normalized strings, but that is not always the case. Also, what kind of normalization? NFC? NFKC?
All of this is trivial to implement quickly with my Unicode library.
No, it is not.

Your Unicode library is locale-agnostic, which makes it quite useless in too many cases. Almost every added function was locale-sensitive:

- search
- collation
- case handling

And so on. This is a major drawback of your library: it is not capable of doing the locale-sensitive algorithms that are the vast majority of Unicode algorithms.

Artyom

On 28/01/2011 14:58, Artyom wrote:
What am I paying for? I don't see how I gain anything.
You don't pay for validation of the UTF-8, especially when 99% of uses of the string are encoding-agnostic.
I asked for what I gained, not what I did not lose.
// UTF validation
bool is_valid_utf() const;
See, that's what makes the whole thing pointless.
Actually not, consider:
socket.read(my_string); if(!my_string.is_valid_utf()) ....
Could be a free function, and would actually be *better* as a free function, because you could apply it on any range, not just your type.
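For instance, a validity check written as a free function over an arbitrary byte range might look like the following. This is a hand-rolled sketch; a real implementation would sit with the rest of the Unicode algorithms and would likely be table-driven.

    // Works on any iterator range over bytes, not just one string type.
    template<typename ByteIterator>
    bool is_valid_utf8(ByteIterator first, ByteIterator last)
    {
        while (first != last) {
            unsigned char b = static_cast<unsigned char>(*first++);
            int trailing;
            unsigned long cp;

            if (b < 0x80)                { continue; }                   // ASCII
            else if ((b & 0xE0) == 0xC0) { trailing = 1; cp = b & 0x1F; }
            else if ((b & 0xF0) == 0xE0) { trailing = 2; cp = b & 0x0F; }
            else if ((b & 0xF8) == 0xF0) { trailing = 3; cp = b & 0x07; }
            else                         { return false; }               // stray continuation / invalid lead

            for (int i = 0; i < trailing; ++i) {
                if (first == last) return false;                          // truncated sequence
                unsigned char c = static_cast<unsigned char>(*first++);
                if ((c & 0xC0) != 0x80) return false;                     // not a continuation byte
                cp = (cp << 6) | (c & 0x3F);
            }

            // reject overlong encodings, UTF-16 surrogates and out-of-range values
            static const unsigned long min_cp[4] = { 0, 0x80, 0x800, 0x10000 };
            if (cp < min_cp[trailing]) return false;
            if (cp >= 0xD800 && cp <= 0xDFFF) return false;
            if (cp > 0x10FFFF) return false;
        }
        return true;
    }

    // usable on std::string, std::vector<char>, char arrays, memory-mapped data...
    // bool ok = is_valid_utf8(my_string.begin(), my_string.end());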
Your type doesn't add any semantic value on top of std::string, it's just an agglomeration of free functions into a class. That's a terrible design. The only advantage that a specific type for unicode strings would bring is that it could enforce certain useful invariants.
You don't need to enforce things you don't care about in 99% of cases.
You don't get the point. Your type doesn't add any information on top of std::string. Therefore it is meaningless. It's just an agglomeration of functions, in C++ we use namespaces for that, not classes.
Enforcing that the string is in a valid UTF encoding and is normalized in a specific normalization form can make most Unicode algorithms several orders of magnitude faster.
You do not always want to normalize text. It is the user's choice; you may have optimized algorithms for already-normalized strings, but that is not always the case.
If my strings are valid and normalized, I can compare them with a simple binary-level comparison; likewise for substring search, where I may also need to add a boundary check if I want fine-grain search. What you want to do is implement comparison by iterating through each lazily computed code point and comparing them. This is at least 60 times as slow; it also doesn't really compare equivalent characters in the strings.

To get correct behaviour when comparing strings, they should be normalized. Normalization is costly, so you don't want to do it at each comparison, but only once. In practice, all data available everywhere should already be in NFC (XML mandates it, for example), and checking whether a string is normalized is very fast (though less fast than checking whether a string is valid UTF-8, since you still need to access a table, which might hurt the cache, and it is not vectorizable).

Dealing with potentially invalid UTF strings can be highly dangerous as well; exploits for that kind of thing are commonplace. I suspect denormalized Unicode could be sensitive too, since in some parts of your application 00e0 (à) and 0061 0300 (a + `) could compare equal but not in others, depending on what that string went through, causing inconsistencies.

Anyway, the only value we can bring on top of the range abstraction is by establishing invariants. It makes sense to establish the strongest one, though I am not opposed to just checking for UTF validity. But no checking at all? There is no point. You might as well make your string type a typedef of std::string.
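The pattern being argued for, in sketch form: establish the invariant once at construction, then comparisons are plain byte comparisons. normalize_nfc below is a stand-in for whatever normalization routine is actually used (ICU, Boost.Locale, ...), not a real function.

    #include <string>

    std::string normalize_nfc(std::string const &utf8);   // hypothetical normalization routine

    class nfc_string {
    public:
        explicit nfc_string(std::string const &raw)
            : data_(normalize_nfc(raw)) {}                 // pay the normalization cost once, on entry

        // from here on, equality and ordering are plain byte comparisons,
        // and equivalent strings (e.g. "\u00E0" vs "a\u0300") compare equal
        friend bool operator==(nfc_string const &a, nfc_string const &b) { return a.data_ == b.data_; }
        friend bool operator< (nfc_string const &a, nfc_string const &b) { return a.data_ <  b.data_; }

    private:
        std::string data_;   // invariant: valid UTF-8, normalization form C
    };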
Also what kind of normalization NFC? NFKC?
NFC, of course. It takes less space and doesn't make you lose anything. If you want to work in decomposed forms or something else, use your own container and not the adaptor. Remember, this whole thing is just there to help you deal with the general case in a practical, correct and efficient way. The real algorithms are fully generic, and allow you to do whatever you want; they accept both normalized and un-normalized strings, data regardless of its memory layout, etc.
All of this is trivial to implement quickly with my Unicode library.
No, it is not.
I know better what I described and what my library is capable of, thank you.
Your Unicode library is locale agnostic which makes it quite useless in too many cases.
In the common case, you don't care (nor want to care) about a locale.
Almost every added function was locale sensitive:
- search
- collation
- case handling
And so on. This is a major drawback of your library: it is not capable of doing the locale-sensitive algorithms that are the vast majority of Unicode algorithms.
Search up to the combining character sequence boundary is locale-agnostic. Search up to the grapheme boundary is virtually locale-agnostic (Unicode does not distribute locale alternatives, though it does hint at the possibility).

Case folding only has a couple of characters that are specific to Turkish, making it quite reasonably locale-agnostic.

Collation depends on a special table; Unicode only provides a default one, which aims at being as locale-agnostic as possible. It also hosts a repository where one can get alternative tables.

Anyway, those are mere details; you can always change the backend for one tailored to your locale.

If my strings are valid and normalized, I can compare them with a simple binary-level comparison; likewise for substring search, where I may also need to add a boundary check if I want fine-grain search.
No, you can't. For example, when you search for the word שלום you want to find שָלוֹם as well (with diacritics) that are not normalized. Search and collation require much more complicated level-based comparison.
To get correct behaviour when comparing strings, they should be normalized. Normalization is costly, so you don't want to do it at each comparison, but only once. In practice, all data available everywhere should already be in NFC (XML mandates it, for example) and checking whether a string is normalized is very fast (while less fast than checking if a string is valid UTF-8, since you still need to access a table, which might hurt the cache, and is not vectorizable).
Dealing with potentially invalid UTF strings can be highly dangerous as well, exploits for that kind of thing are common-place. I suspect denormalized Unicode could be sensitive too, since in some parts of your application 00e0 (à) and 0061 0300 (a + `) could compare equal but not in others, depending on what that string went through, causing inconsistencies.
The problem is that I may want 00e0 (à) and 0061 0300 (a + `) and 0061 (a) to be equal for string search as well. I agree that normalization makes things simpler, but in many real-world situations it is just not the case. In any case, I agree that most of the algorithms may and should be external, but sometimes it is just convenient to have them within the object.
Anyway, the only value we can bring on top of the range abstraction is by establishing invariants. It makes sense to establish the strongest one; though I am not opposed to just checking for UTF validity.
There are many things to check; checking for valid UTF is just one of the most basic things to do when you get text from an untrusted source.
All of this is trivial to implement quickly with my Unicode library.
No, it is not.
I know better what I described and what my library is capable of, thank you.
Your Unicode library is locale agnostic which makes it quite useless in too many cases.
In the common case, you don't care (nor want to care) about a locale.
Almost every added function was locale sensitive:
- search
- collation
- case handling
And so on. This is a major drawback of your library: it is not capable of doing the locale-sensitive algorithms that are the vast majority of Unicode algorithms.
Search up to the combining character sequence boundary is locale-agnostic.
I'm talking about several primary collation level that are locale agnostic.
Search up to the grapheme boundary is virtually locale-agnostic (Unicode does not distribute locale alternatives, though it does hint at its possibility)
It provides CLDR, the locale database that has all the tables you need.
Case folding only has a couple of characters that are specific for Turkish, making it quite reasonably locale-agnostic.
If I am not mistaken, case folding is locale-agnostic; it is case mapping that is locale-sensitive. And "quite reasonably locale-agnostic" is not the answer for a Turkish speaker :-) Just as you could say that text is generally LTR, with the small exception of Hebrew, Arabic and Persian... so why care?
Collation depends on a special table; Unicode only provides a default one, which aims at being as locale-agnostic as possible. It also hosts a repository where one can get alternative tables.
Any reasonable Unicode library must use them: ICU uses CLDR, the Windows Unicode API uses CLDR, and CLDR even provides tables for the POSIX API to make it more convenient. So ignoring CLDR is just wrong. CLDR is an integral part of Unicode, just like its algorithms and character property database.
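For what it's worth, even the standard library exposes locale-tailored ordering of this kind through the collate facet. A minimal example follows; the locale name is system-dependent and only illustrative, and std::locale will throw if it is not installed.

    #include <locale>
    #include <string>

    // Locale-sensitive three-way comparison using the standard collate facet.
    int compare_in_locale(std::string const &a, std::string const &b, std::locale const &loc)
    {
        std::collate<char> const &coll = std::use_facet<std::collate<char> >(loc);
        return coll.compare(a.data(), a.data() + a.size(),
                            b.data(), b.data() + b.size());
    }

    int main()
    {
        // In Swedish, 'ä' sorts after 'z'; in German it sorts with 'a'.
        // The same byte sequences therefore order differently per locale,
        // which is exactly the tailoring CLDR-based data provides.
        std::locale swedish("sv_SE.UTF-8");            // throws if the locale is not available
        std::string x = "z", y = "\xC3\xA4";           // "ä" in UTF-8
        int r = compare_in_locale(x, y, swedish);      // expected < 0 under Swedish rules
        (void)r;
    }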
Anyway, those are mere details; you can always change the backend for one tailored to your locale.
I'm rather talking about the concept. I do like the idea of having a full Unicode library in Boost, but it should be done right. There are many non-trivial problems with ICU, but it is still the best library we have around, and a huge amount of work was put into it.

Artyom

On 30/01/2011 08:46, Artyom wrote:
If my strings are valid and normalized, I can compare them with a simple binary-level comparison; likewise for substring search, where I may also need to add a boundary check if I want fine-grain search.
No you can't
For example, when you search for the word שלום you want to find שָלוֹם as well (with diacritics) that are not normalized.
Unless I understand that wrong, they're as equal as e is equal to é or a is equal to à.
Search and Collation require much more complicated levels comparison.
Right, I'm talking about exact comparison, not collation. Exact comparison is what you use in most text processing and parsing. You can perform collation folding with the right level if you want those two strings to compare equal.
The problem is that I may want 00e0 (à) and 0061 0300 (a + `) and 0061 (a) to be equal for string search as well.
You may, but that should not be the default behaviour of operator== and operator<.

From: Mathias Gaunard <mathias.gaunard@ens-lyon.org> Subject: Re: [boost] [string] Realistic API proposal
On 30/01/2011 08:46, Artyom wrote:
If my strings are valid and normalized, I can compare them with a simple binary-level comparison; likewise for substring search, where I may also need to add a boundary check if I want fine-grain search.
No you can't
For example, when you search for the word שלום you want to find שָלוֹם as well (with diacritics) that are not normalized.
Unless I understand that wrong, they're as equal as e is equal to é or a is equal to à.
Yes, with the small exception that "שָ" in its NFC form consists of two code points, for the "base letter" and the "vowel mark", which should be equal to "ש", the "base letter" alone, unlike "à", which has one code point in NFC form, like "a".
Search and Collation require much more complicated levels comparison.
Right, I'm talking about exact comparison, not collation. Exact comparison is what you use in most text processing and parsing.
You can perform collation folding with the right level if you want those two strings to compare equal.
The problem is that I may want 00e0 (à) and 0061 0300 (a + `) and 0061 (a) to be equal for string search as well.
You may, but that should not be the default behaviour of operator== and operator<.
The default behavior is binary comparison, but that is not what I'm looking for. I'm looking for a search/comparison algorithm that can see "à" and "a", and "שָ" and "ש", as equal.

Artyom

Hi Artyom,

Artyom wrote:
I'd like to provide a realistic string API proposal:
I've been keeping out of this up to now, but since there is something concrete here I'll share my thoughts.
// Fully bidirectional iterator
template<typename UnitsIterator>
class const_code_point_iterator {
public:
    const_code_point_iterator(UnitsIterator begin, UnitsIterator end); // begin
    const_code_point_iterator(UnitsIterator begin, UnitsIterator end, UnitsIterator location); // current pos
    const_code_point_iterator(); // end

    #ifdef C++0x
    typedef char32_t const_code_point_type;
    #else
    typedef unsigned const_code_point_type;
    #endif

    const_code_point_type operator*() const;
    ...
};
I have something broadly like this here: http://svn.chezphil.org/libpbe/trunk/include/charset/const_character_iterato... I attempted to do this with the character set as a template parameter and a "charset traits" class providing encoding and decoding functions. That was probably over-complicated; making it utf-8 only would be fine - but in that case, it should have a name that says "utf8".

I do find it somewhat unsatisfactory that you need to store the begin and end of the underlying string. This triples the size of what could otherwise be a single pointer. I think these are only needed to detect invalid utf-8, aren't they? In some of my code I had an error_policy template parameter that allowed you to specify whether the input should be trusted or not; if it's trusted you can avoid this overhead. Even then, though, you can't avoid having begin and end in the interface, adding verbosity.

Another way to avoid storing begin and end is to somehow make those iterators empty structs (and hence also default-constructible). Specifically, if your underlying string is guaranteed to be null-terminated, the end iterator can be stateless. I guess you could avoid storing the begin iterator by prepending a null, but that doesn't work for std::string.
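To illustrate the "trusted input" variant, here is a minimal decoding iterator that stores only a single pointer, on the assumption that the bytes are already known to be valid UTF-8. It is a sketch, not code from either library; decrement is omitted (a full bidirectional version would scan backwards over continuation bytes), and no end-of-range or error checks are made.

    class trusted_utf8_iterator
    {
    public:
        explicit trusted_utf8_iterator(char const *pos) : p_(pos) {}

        // decode the code point starting at the current position
        char32_t operator*() const
        {
            unsigned char b = static_cast<unsigned char>(*p_);
            if (b < 0x80) return b;
            int n = (b & 0xE0) == 0xC0 ? 1 : (b & 0xF0) == 0xE0 ? 2 : 3;   // number of trailing bytes
            char32_t cp = b & (0x3F >> n);                                 // bits from the lead byte
            for (int i = 1; i <= n; ++i)
                cp = (cp << 6) | (static_cast<unsigned char>(p_[i]) & 0x3F);
            return cp;
        }

        // advance by one whole code point
        trusted_utf8_iterator &operator++()
        {
            unsigned char b = static_cast<unsigned char>(*p_);
            p_ += b < 0x80 ? 1 : (b & 0xE0) == 0xC0 ? 2 : (b & 0xF0) == 0xE0 ? 3 : 4;
            return *this;
        }

        friend bool operator==(trusted_utf8_iterator a, trusted_utf8_iterator b) { return a.p_ == b.p_; }
        friend bool operator!=(trusted_utf8_iterator a, trusted_utf8_iterator b) { return a.p_ != b.p_; }

    private:
        char const *p_;   // a single pointer, not a (begin, end, position) triple
    };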
/// Output iterator
template<typename BackInserter>
class code_point_iterator {
public:
    code_point_iterator(BackInserter out); // begin
    code_point_iterator(); // end

    #ifdef C++0x
    typedef char32_t code_point_type;
    #else
    typedef unsigned code_point_type;
    #endif

    code_point_type operator*() const;
    ...
};
So this only allows appending, right? I have something like that here: http://svn.chezphil.org/libpbe/trunk/include/charset/character_output_iterat... Broadly, I would say that allowing bidirectional reading and append-only writing is the right thing to do for strings. If anyone has an hour to spare, it's educational to try hacking your code to use std::list<char> instead of std::string, and see how much of it still compiles.
template<typename Char,typename Traits=std::char_traits<Char>,typename Alloc=std::allocator<Char> >
class basic_string {
public:
    // { boost specific
    typedef std::basic_string<Char,Traits,Alloc> std_string_type;
    // } boost specific

    // All the standard std::string member functions

    // Deprecated interfaces that exist for backward compatibility,
    // as they are not Unicode aware
    value_type &at(size_type indx);
    value_type &operator[](size_type indx);
    iterator begin();
    iterator end();

    // { boost specific compatibility functions with std::string; they would go away
    // as std::string becomes extended with boost::string's new interfaces
    //
    basic_string(std_string_type const &other) : data_(other) {}
    basic_string(std_string_type const &other,size_type index,size_type len) : data_(other,index,len) {}

    ...

    operator std_string_type() const { return data_; }

    // } boost specific compatibility functions

    //
    // Unicode Support
    // ------------------------
    //

    //
    // UTF code point iteration
    //

    #ifdef C++0x
    typedef char32_t code_point_type;
    #else
    typedef unsigned code_point_type;
    #endif

    typedef boost::const_code_point_iterator<const_iterator> const_code_point_iterator;

    const_code_point_iterator code_point_begin() const
    {
        return const_code_point_iterator(begin(),end());
    }
    const_code_point_iterator code_point_end() const
    {
        return const_code_point_iterator(begin(),end(),end());
    }

    typedef boost::code_point_iterator<std::back_insert_iterator<basic_string> > code_point_iterator;

    code_point_iterator back_inserter()
    {
        return code_point_iterator(std::back_inserter(*this));
    }

    basic_string &operator+=(code_point_type code_point);
    basic_string operator+(code_point_type code_point) const;
    void append(code_point_type code_point);
The approach that I would prefer is more like:

    template <typename impl_t>
    class utf8_string_adaptor {
        impl_t impl;
        ..
    };

    typedef utf8_string_adaptor<std::string> utf8_string;

In this way:

- I can wrap other containers than std::string, e.g. sgi::rope, char*, std::vector etc.
- utf8_string::begin() can return a utf8_character_iterator.
- Accessing the underlying bytes is possible but requires something explicit, e.g. foo.base().begin().
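Client code under that scheme might read roughly as follows; utf8_character_iterator and base() come from the sketch above, not from an existing library, so this is only indicative.

    typedef utf8_string_adaptor<std::string> utf8_string;

    void example(utf8_string const &s)
    {
        // character-level (code point) access by default
        for (utf8_character_iterator it = s.begin(); it != s.end(); ++it) {
            char32_t c = *it;
            // ...
            (void)c;
        }

        // byte-level access is still possible, but has to be asked for explicitly
        std::string::const_iterator raw = s.base().begin();
        (void)raw;
    }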
    //
    // Lexical operations on the string
    //

    // Case handling
    basic_string upper_case(std::locale const &l=std::locale()) const;
    basic_string lower_case(std::locale const &l=std::locale()) const;
    basic_string title_case(std::locale const &l=std::locale()) const;
    basic_string fold_case() const; // locale independent

    // Unicode normalization
    typedef enum { nfc, nfkc, nfd, nfkd } normalization_mode;

    basic_string normalize(normalization_mode mode = nfc) const;

    // normalized string constructors
    basic_string(basic_string const &,normalization_mode mode);
    basic_string(Char const *,normalization_mode mode);
    basic_string(Char const *,size_t n,normalization_mode mode);
    template<typename Iterator>
    basic_string(Iterator begin,Iterator end,normalization_mode mode);

    void append_normalized(basic_string const &other,normalization_mode mode = nfc);
    void append_normalized(Char const *,normalization_mode mode = nfc);
    void append_normalized(Char const *,size_t n,normalization_mode mode = nfc);

    basic_string concat_normalized(basic_string const &other,normalization_mode mode = nfc) const;
    basic_string concat_normalized(Char const *,normalization_mode mode = nfc) const;
    basic_string concat_normalized(Char const *,size_t n,normalization_mode mode = nfc) const;

    // Unicode validation
    bool valid_utf() const;
[snip]

Surely almost all of that should be in free functions and generic algorithms, no? E.g. valid_utf8() could be an algorithm that takes a pair of iterators over bytes, and then it can be used on any sequence.

Regards, Phil.
participants (4)
- Artyom
- Mathias Gaunard
- Phil Endecott
- Sebastian Redl