[string] Yet another Unicode string class

10 Feb 2011

      I'm working on yet another Unicode string class/library, built around 
a different set of features and requirements.

* It is designed around the codepoint concept.
* It uses (currently forward-) iterators for encoding and decoding.
* It has a minimal interface, mostly constructors and iterator access.
* Most other functions can (hopefully) be free functions.
* It uses basic_string as backend.
* It has fast access to underlying basic_string.
* It is (currently) using some C++0X features (mainly decltype).
* It is (currently) immutable and shares data, and thus fast to copy.

Some of these features and requirements may be unacceptable to some of 
you, but I'm open to suggestions and comments.

//  Copyright (c) 2011 Anders Dalvander.
//
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
//   http://www.boost.org/LICENSE_1_0.txt)

// Immutable text value that stores its contents as code units of `encoding`
// inside a std::basic_string and exposes them as a range of code points.
//
// `encoding` is expected to provide:
//   - codeunit_type and codepoint_type
//   - template <typename Iterator> decode_iterator, constructible from
//     (position, range_begin, range_end) code unit iterators
//   - template <typename Iterator> encode_iterator, constructible from
//     (position, range_begin, range_end) code point iterators and usable as
//     the iterator pair that constructs a std::basic_string<codeunit_type>
//
// The string is held through a std::shared_ptr to const, so copying a
// basic_text copies the pointer and both objects refer to the same string.
template <typename encoding>
class basic_text
{
public:
    typedef encoding encoding_type;
    typedef typename encoding_type::codeunit_type codeunit_type;
    typedef typename encoding_type::codepoint_type codepoint_type;
    typedef std::basic_string<codeunit_type> string_type;
    typedef typename string_type::const_iterator codeunit_iterator;
    // decode_iterator is a dependent member template, so the `template`
    // disambiguator is required for this to parse on conforming compilers.
    typedef typename encoding_type::template decode_iterator<codeunit_iterator>
       codepoint_iterator;
    typedef codepoint_iterator const_iterator;
    typedef codepoint_iterator iterator;

    // Constructs an empty text.
    basic_text()
       : s(std::make_shared<string_type>())
    {
    }

    // Converting constructor: iterates the code points of `text` and
    // re-encodes them with this class's encoding. Intentionally implicit.
    template <typename other_encoding>
    basic_text(const basic_text<other_encoding>& text)
       : s(encode_range(std::begin(text), std::end(text)))
    {
    }

    // Constructs from any range accepted by std::begin/std::end. Each
    // element of `c` is handed to the encoder as a code point; `c` itself is
    // not decoded first.
    // NOTE: for a built-in array std::end(c) is one past the last element,
    // so a string literal contributes its terminating NUL as well.
    // TODO: Use some default_encoding traits type.
    template <typename container>
    explicit basic_text(const container& c)
       : s(encode_range(std::begin(c), std::end(c)))
    {
    }

    // Constructs from the code point range [first, last).
    // The template parameter is deliberately not called codepoint_iterator,
    // which would hide the class typedef of that name inside the constructor.
    template <typename input_iterator>
    basic_text(input_iterator first, input_iterator last)
       : s(encode_range(first, last))
    {
    }

    // Iterator to the first code point, decoding the stored code units.
    codepoint_iterator begin() const
    {
       return codepoint_iterator
          (codeunit_begin(), codeunit_begin(), codeunit_end());
    }

    // Past-the-end code point iterator.
    codepoint_iterator end() const
    {
       return codepoint_iterator
          (codeunit_end(), codeunit_begin(), codeunit_end());
    }

    // Iterator to the first stored code unit.
    codeunit_iterator codeunit_begin() const
    {
       return std::begin(*s);
    }

    // Past-the-end iterator over the stored code units.
    codeunit_iterator codeunit_end() const
    {
       return std::end(*s);
    }

    // Read-only access to the underlying string of code units.
    const string_type& str() const
    {
       return *s;
    }

    // Null-terminated pointer to the stored code units, as returned by
    // std::basic_string::c_str().
    const codeunit_type* c_str() const
    {
       return s->c_str();
    }

private:
    typedef std::shared_ptr<const string_type> pointer_type;

    // Encodes the code point range [first, last) into a newly allocated
    // string. Shared by all non-default constructors.
    template <typename input_iterator>
    static pointer_type encode_range(input_iterator first, input_iterator last)
    {
       // Dependent member template: needs both `typename` and `template`.
       typedef typename encoding_type::template encode_iterator<input_iterator>
          encoder;
       return std::make_shared<string_type>(
          encoder(first, first, last), encoder(last, first, last));
    }

    pointer_type s;
};

// Encodings and the text types instantiated from them.
// NOTE: `undefined-type` is a placeholder in this sketch, not valid C++;
// each *_encoding still has to be bound to an actual encoding class.
typedef undefined-type utf8_encoding;
typedef basic_text<utf8_encoding> u8text;
typedef undefined-type utf16_encoding;
typedef basic_text<utf16_encoding> u16text;
typedef undefined-type utf32_encoding;
typedef basic_text<utf32_encoding> u32text;
typedef undefined-type wchar_encoding;
typedef basic_text<wchar_encoding> wtext;
typedef undefined-type ascii_encoding;
typedef basic_text<ascii_encoding> ascii_text;

Usage:

// Walks through the basic_text constructors using the text typedefs.
int main()
{
    // Sample code points to encode.
    const uint32_t cps[] = {0x41,0x42,0x80,0x800,0x10000,0x10ffff};

    // construct from codepoint range
    u8text u8txt(std::begin(cps), std::end(cps));

    // construct from encoded container,
    // currently treats each element as a codepoint
    // NOTE(review): std::end on a string literal is one past the terminating
    // '\0', so the NUL is presumably encoded as well -- confirm this is intended.
    u8text u8txt2("test");

    // sharing is caring: the copy refers to the same underlying string
    u8text u8txt3 = u8txt;

    // construct from codepoint range
    u16text u16txt(std::begin(cps), std::end(cps));

    // construct from text, transcodes range
    u16text u16txt2 = u8txt;

    // construct from text, transcodes range
    u32text u32txt = u8txt;

    // using policy (possible extension)
    // basic_text as posted has no two-argument (text, policy) constructor,
    // and replace_policy is not defined anywhere in the post.
    ascii_text ascii(u8txt, replace_policy(0xff));
}

// Example: passing a UTF-8 text to a wide-character Win32 API.
// wtext(txt) builds a temporary wtext from `txt`; the temporary lives until
// the end of the full expression, so the pointer returned by c_str() stays
// valid for the duration of the CreateFileW call.
// The remaining CreateFileW arguments are elided ("...") in this sketch.
void OpenFileWin32(const u8text& txt)
{
    CloseHandle(CreateFileW(wtext(txt).c_str(), ...));
}

// Text type for narrow-character POSIX APIs.
// NOTE: `undefined-type` is a placeholder in this sketch, not valid C++;
// posix_encoding still has to be bound to an actual encoding class.
typedef undefined-type posix_encoding;
typedef basic_text<posix_encoding> posixtext;

// Example: passing a UTF-8 text to a POSIX API.
// posixtext(txt) builds a temporary posixtext from `txt`; the temporary lives
// until the end of the full expression, so the pointer returned by c_str()
// stays valid for the duration of the open call.
// The remaining open arguments are elided ("...") in this sketch.
void OpenFilePosix(const u8text& txt)
{
    close(open(posixtext(txt).c_str(), ...));
}

Regards,
Anders Dalvander

-- 
WWFSMD?

Anders Dalvander

Mathias Gaunard

Anders Dalvander

Dave Abrahams

Anders Dalvander

Mathias Gaunard

Scott McMurray

Stephan T. Lavavej

Sebastian Redl

John Bytheway

Howard Hinnant

Mathias Gaunard

Jeff Flinn

Joel Falcou

Mathias Gaunard

tags

participants (10)