4 files changed, 906 insertions, 0 deletions
diff --git a/ml/dlib/dlib/tokenizer/tokenizer_kernel_1.cpp b/ml/dlib/dlib/tokenizer/tokenizer_kernel_1.cpp
new file mode 100644
index 000000000..daa83184c
--- /dev/null
+++ b/ml/dlib/dlib/tokenizer/tokenizer_kernel_1.cpp
@@ -0,0 +1,295 @@
+// Copyright (C) 2005  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_TOKENIZER_KERNEL_1_CPp_
+#define DLIB_TOKENIZER_KERNEL_1_CPp_
+#include "tokenizer_kernel_1.h"
+
+#include <iostream>
+#include <cstdio>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    // Default constructor: allocates the head/body lookup tables and then calls
+    // clear() to establish the initial state.
+    //
+    // The tables are indexed with static_cast<unsigned char>(c), which ranges over
+    // 0..UCHAR_MAX inclusive, so UCHAR_MAX+1 slots are allocated (UCHAR_MAX slots
+    // would leave index UCHAR_MAX out of bounds).  The trailing () value-initializes
+    // every slot to false.
+    //
+    // NOTE(review): clear() and set_identifier_token() only reset slots
+    // 0..UCHAR_MAX-1; their loops should be widened to cover the last slot as well.
+    //
+    // in, streambuf and next_type are given defined values up front so that no
+    // member is ever read (e.g. by swap()) while still indeterminate.
+    //
+    // If an allocation or clear() throws, any table that was already allocated is
+    // released and the exception is rethrown.
+    tokenizer_kernel_1::
+    tokenizer_kernel_1 (        
+    ) :
+        in(0),
+        streambuf(0),
+        headset(0),
+        bodyset(0),
+        next_type(END_OF_FILE),
+        have_peeked(false)
+    {
+        try
+        {
+            headset = new bool[UCHAR_MAX+1]();
+            bodyset = new bool[UCHAR_MAX+1]();
+
+            clear();
+        }
+        catch (...)
+        {
+            // delete [] on a null pointer is a no-op, so no null checks are needed
+            delete [] headset;
+            delete [] bodyset;
+            throw;
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Destructor: releases the two lookup tables allocated by the constructor.
+    tokenizer_kernel_1::
+    ~tokenizer_kernel_1 (
+    )
+    {
+        delete [] headset;
+        delete [] bodyset;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Puts the object back into its initial state: no stream attached, no peeked
+    // token pending, and the default identifier head/body character sets with the
+    // lookup tables rebuilt to match them.
+    void tokenizer_kernel_1::
+    clear(
+    )
+    {
+        // detach from any stream and drop any peeked token
+        in = 0;
+        streambuf = 0;
+        have_peeked = false;
+
+        // the default body set is the default head set plus the digits
+        head = "_" + lowercase_letters() + uppercase_letters();
+        body = head + numbers();
+
+        // wipe both tables ...
+        for (unsigned long slot = 0; slot != UCHAR_MAX; ++slot)
+            headset[slot] = bodyset[slot] = false;
+
+        // ... and flag every character that belongs to each set
+        for (std::string::const_iterator c = head.begin(); c != head.end(); ++c)
+            headset[static_cast<unsigned char>(*c)] = true;
+        for (std::string::const_iterator c = body.begin(); c != body.end(); ++c)
+            bodyset[static_cast<unsigned char>(*c)] = true;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Attaches the tokenizer to in_: remembers the stream and its stream buffer
+    // and discards any token that was buffered by an earlier peek.
+    void tokenizer_kernel_1::
+    set_stream (
+        std::istream& in_
+    )
+    {
+        have_peeked = false;
+        streambuf = in_.rdbuf();
+        in = &in_;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Returns true when a stream is currently attached (in is non-null).
+    bool tokenizer_kernel_1::
+    stream_is_set (
+    ) const
+    {
+        if (in == 0)
+            return false;
+        return true;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Returns the attached stream.  The pointer is dereferenced without a check,
+    // so a stream must have been set before calling this.
+    std::istream& tokenizer_kernel_1::
+    get_stream (
+    ) const
+    {
+        std::istream& attached = *in;
+        return attached;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Produces the next token.
+    //
+    //   type  - receives END_OF_FILE, END_OF_LINE, WHITE_SPACE, IDENTIFIER,
+    //           NUMBER or CHAR
+    //   token - receives the token text (empty for END_OF_FILE)
+    //
+    // If a token was buffered by peek_type()/peek_token() it is handed out and
+    // the buffer is marked empty; otherwise characters are read directly from the
+    // stream buffer.  The stream buffer pointer is dereferenced without a check,
+    // so a stream must have been set.
+    //
+    // Reading pattern used for multi-character tokens: sbumpc() consumes the
+    // first character, sgetc() looks at the following one without consuming it,
+    // and snextc() consumes the character just accepted and returns the one
+    // after it.  The first character that does not belong to the token is
+    // therefore left in the stream for the next call.
+    void tokenizer_kernel_1::
+    get_token (
+        int& type,
+        std::string& token
+    )
+    {
+        if (!have_peeked)
+        {
+            std::streambuf::int_type ch;
+            ch = streambuf->sbumpc();
+
+            switch (ch)
+            {
+            case EOF:
+                type = END_OF_FILE;
+                token.clear();
+                return;
+
+            case '\n':
+                type = END_OF_LINE;
+                token = "\n";
+                return;
+
+            case '\r':
+            case ' ':
+            case '\t':
+                type = WHITE_SPACE;
+                token = static_cast<char>(ch);
+                ch = streambuf->sgetc();
+                // EOF can never compare equal to one of these characters, so no
+                // separate EOF test is needed to end the loop
+                while (ch == ' ' || ch == '\t' || ch == '\r')
+                {
+                    token += static_cast<char>(ch);
+                    ch = streambuf->snextc();
+                }
+                return;
+
+            default:
+                // ch cannot be EOF here (handled by the case above), so it is a
+                // character value and safe to use as a table index
+                if (headset[static_cast<unsigned char>(ch)])
+                {
+                    type = IDENTIFIER;
+                    token = static_cast<char>(ch);
+                    ch = streambuf->sgetc();
+                    // EOF must be ruled out BEFORE indexing: converting EOF to
+                    // unsigned char would otherwise be used as a table index
+                    while ( ch != EOF && bodyset[static_cast<unsigned char>(ch)] )
+                    {
+                        token += static_cast<char>(ch);
+                        ch = streambuf->snextc();
+                    }
+                }
+                else if ('0' <= ch && ch <= '9')
+                {
+                    type = NUMBER;
+                    token = static_cast<char>(ch);
+                    ch = streambuf->sgetc();
+                    // EOF lies outside the range '0'..'9', so the range test
+                    // alone terminates the loop at end of input
+                    while ('0' <= ch && ch <= '9')
+                    {
+                        token += static_cast<char>(ch);
+                        ch = streambuf->snextc();
+                    }
+                }
+                else
+                {
+                    type = CHAR;
+                    token = static_cast<char>(ch);
+                }
+                return;
+            } // switch (ch)
+        }
+        
+        // Every path through the switch returns, so reaching this point means a
+        // token was peeked earlier: hand it out and mark the buffer as consumed.
+        type = next_type;
+        token = next_token;
+        have_peeked = false;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Returns the type of the upcoming token without consuming it.  The token is
+    // fetched through get_token() into the mutable next_type/next_token members
+    // and flagged as peeked so that the next get_token() call returns it.
+    int tokenizer_kernel_1::
+    peek_type (
+    ) const
+    {
+        // get_token() is non-const, hence the cast on this
+        tokenizer_kernel_1* const self = const_cast<tokenizer_kernel_1*>(this);
+        self->get_token(next_type,next_token);
+        have_peeked = true;
+        return next_type;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Returns the text of the upcoming token without consuming it.  The token is
+    // fetched through get_token() into the mutable next_type/next_token members
+    // and flagged as peeked so that the next get_token() call returns it.  The
+    // returned reference refers to the next_token member.
+    const std::string& tokenizer_kernel_1::
+    peek_token (
+    ) const
+    {
+        // get_token() is non-const, hence the cast on this
+        tokenizer_kernel_1* const self = const_cast<tokenizer_kernel_1*>(this);
+        self->get_token(next_type,next_token);
+        have_peeked = true;
+        return next_token;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Swaps the complete state of *this and item, member by member.
+    void tokenizer_kernel_1::
+    swap (
+        tokenizer_kernel_1& item
+    )
+    {
+        // peek buffer
+        exchange(have_peeked,item.have_peeked);
+        exchange(next_type,item.next_type);
+        exchange(next_token,item.next_token);
+
+        // identifier character sets and their lookup tables
+        exchange(head,item.head);
+        exchange(headset,item.headset);
+        exchange(body,item.body);
+        exchange(bodyset,item.bodyset);
+
+        // attached stream
+        exchange(in,item.in);
+        exchange(streambuf,item.streambuf);
+    }
+
+// ----------------------------------------------------------------------------------------
+    
+    // Replaces the identifier character sets: head_ holds the characters that
+    // may start an IDENTIFIER and body_ the characters that may follow.  Both
+    // lookup tables are rebuilt from the new strings.
+    void tokenizer_kernel_1::
+    set_identifier_token (
+        const std::string& head_,
+        const std::string& body_
+    )
+    {
+        head = head_;
+        body = body_;
+
+        // wipe both tables ...
+        for (unsigned long slot = 0; slot != UCHAR_MAX; ++slot)
+            headset[slot] = bodyset[slot] = false;
+
+        // ... and flag every character that belongs to each set
+        for (std::string::const_iterator c = head.begin(); c != head.end(); ++c)
+            headset[static_cast<unsigned char>(*c)] = true;
+        for (std::string::const_iterator c = body.begin(); c != body.end(); ++c)
+            bodyset[static_cast<unsigned char>(*c)] = true;
+    }
+
+// ----------------------------------------------------------------------------------------
+    
+    // Returns a copy of the characters that may start an IDENTIFIER token.
+    const std::string tokenizer_kernel_1::
+    get_identifier_head (
+    ) const
+    {
+        return std::string(head);
+    }
+
+// ----------------------------------------------------------------------------------------
+    
+    // Returns a copy of the characters that may appear in the body of an
+    // IDENTIFIER token.
+    const std::string tokenizer_kernel_1::
+    get_identifier_body (
+    ) const
+    {
+        return std::string(body);
+    }
+
+// ----------------------------------------------------------------------------------------
+    
+    // Returns the 26 lowercase letters a-z as a string.
+    const std::string tokenizer_kernel_1::
+    lowercase_letters (
+    ) const
+    {
+        return "abcdefghijklmnopqrstuvwxyz";
+    }
+
+// ----------------------------------------------------------------------------------------
+    
+    // Returns the 26 uppercase letters A-Z as a string.
+    const std::string tokenizer_kernel_1::
+    uppercase_letters (
+    ) const
+    {
+        return "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+    }
+
+// ----------------------------------------------------------------------------------------
+    
+    // Returns the ten decimal digits 0-9 as a string.
+    const std::string tokenizer_kernel_1::
+    numbers (
+    ) const
+    {
+        return "0123456789";
+    }
+    
+// ----------------------------------------------------------------------------------------
+    
+}
+#endif // DLIB_TOKENIZER_KERNEL_1_CPp_
+
diff --git a/ml/dlib/dlib/tokenizer/tokenizer_kernel_1.h b/ml/dlib/dlib/tokenizer/tokenizer_kernel_1.h
new file mode 100644
index 000000000..d67ae278f
--- /dev/null
+++ b/ml/dlib/dlib/tokenizer/tokenizer_kernel_1.h
@@ -0,0 +1,155 @@
+// Copyright (C) 2005  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_TOKENIZER_KERNEl_1_
+#define DLIB_TOKENIZER_KERNEl_1_
+
+#include <string>
+#include <iosfwd>
+#include <climits>
+#include "../algs.h"
+#include "tokenizer_kernel_abstract.h"
+
+namespace dlib
+{
+
+    class tokenizer_kernel_1 
+    {
+        /*!
+            INITIAL VALUE
+                - in == 0 and streambuf == 0
+                - have_peeked == false
+                - head == "_" + lowercase_letters() + uppercase_letters()
+                - body == "_" + lowercase_letters() + uppercase_letters() + numbers()
+                - headset and bodyset each point to an array of UCHAR_MAX bools
+                  filled in according to the CONVENTION.
+
+            CONVENTION  
+                - Stream:
+                    - if (stream_is_set()) then
+                        - get_stream() == *in
+                        - streambuf == in->rdbuf()
+                    - else
+                        - in == 0
+                        - streambuf == 0
+
+                - Identifier character sets:
+                    - head == get_identifier_head()
+                    - body == get_identifier_body()
+                    - headset[static_cast<unsigned char>(x)] == true if and only if
+                      the char x appears in head
+                    - bodyset[static_cast<unsigned char>(x)] == true if and only if
+                      the char x appears in body
+
+                - Peek buffer:
+                    - if (have_peeked) then
+                        - next_token == the next token to be returned from get_token()
+                        - next_type == the type of the token held in next_token
+        !*/
+
+    public:
+
+        // The enum's name is never used.  It is only there because some compilers
+        // (gcc on MAC OS X) report an error when the enum is left unnamed.
+        enum some_random_name
+        {
+            END_OF_LINE,
+            END_OF_FILE,
+            IDENTIFIER,
+            CHAR,
+            NUMBER,
+            WHITE_SPACE
+        };
+
+        // construction / destruction / reset
+        tokenizer_kernel_1 ();
+        virtual ~tokenizer_kernel_1 ();
+        void clear ();
+
+        // stream association
+        void set_stream (std::istream& in);
+        bool stream_is_set () const;
+        std::istream& get_stream () const;
+
+        // token extraction
+        void get_token (int& type, std::string& token);
+
+        void swap (tokenizer_kernel_1& item);
+
+        // identifier definition
+        void set_identifier_token (const std::string& head, const std::string& body);
+
+        // look-ahead: both are const but fill the mutable peek buffer below
+        int peek_type () const;
+        const std::string& peek_token () const;
+
+        const std::string get_identifier_head () const;
+        const std::string get_identifier_body () const;
+
+        // fixed character classes
+        const std::string lowercase_letters () const;
+        const std::string uppercase_letters () const;
+        const std::string numbers () const;
+
+    private:
+
+        // restricted functions: declared private so objects cannot be copied
+        tokenizer_kernel_1(const tokenizer_kernel_1&);        // copy constructor
+        tokenizer_kernel_1& operator=(const tokenizer_kernel_1&);    // assignment operator
+
+
+        // data members
+        std::istream* in;            // attached stream, 0 when none is set
+        std::streambuf* streambuf;   // in->rdbuf(), 0 when no stream is set
+        std::string head;            // characters that may start an IDENTIFIER
+        std::string body;            // characters that may continue an IDENTIFIER
+        bool* headset;               // lookup table for head (see CONVENTION)
+        bool* bodyset;               // lookup table for body (see CONVENTION)
+
+        // peek buffer, mutable so the const peek functions can fill it
+        mutable std::string next_token;
+        mutable int next_type;
+        mutable bool have_peeked;
+    };    
+
+    // Free-function form of swap: forwards to the member function a.swap(b).
+    inline void swap (
+        tokenizer_kernel_1& a, 
+        tokenizer_kernel_1& b 
+    ) 
+    { 
+        a.swap(b); 
+    }   
+
+}
+
+#ifdef NO_MAKEFILE
+#include "tokenizer_kernel_1.cpp"
+#endif
+
+#endif // DLIB_TOKENIZER_KERNEl_1_
+
diff --git a/ml/dlib/dlib/tokenizer/tokenizer_kernel_abstract.h b/ml/dlib/dlib/tokenizer/tokenizer_kernel_abstract.h
new file mode 100644
index 000000000..f534b8f7f
--- /dev/null
+++ b/ml/dlib/dlib/tokenizer/tokenizer_kernel_abstract.h
@@ -0,0 +1,289 @@
+// Copyright (C) 2005  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_TOKENIZER_KERNEl_ABSTRACT_
+#ifdef DLIB_TOKENIZER_KERNEl_ABSTRACT_
+
+#include <string>
+#include <iosfwd>
+
+namespace dlib
+{
+
+    class tokenizer 
+    {
+        /*!
+            INITIAL VALUE
+                stream_is_set() == false
+                get_identifier_head() == "_" + lowercase_letters() + uppercase_letters()
+                get_identifier_body() == "_" + lowercase_letters() + uppercase_letters() + 
+                                         numbers()
+
+            WHAT THIS OBJECT REPRESENTS
+                This object represents a simple tokenizer for textual data.
+
+            BUFFERING
+                This object is allowed to buffer data from the input stream.
+                Thus if you clear it or switch streams (via calling set_stream())
+                any buffered data will be lost.
+
+            TOKENS
+                When picking out tokens the tokenizer will always extract the 
+                longest token it can.  For example, if faced with the string 
+                "555" it will consider the three 5s to be a single NUMBER 
+                token not three smaller NUMBER tokens.
+
+                Also note that no characters in the input stream are discarded.
+                They will all be returned in the text of some token.  
+                Additionally, each character will never be returned more than once.  
+                This means that if you concatenated all returned tokens it would exactly
+                reproduce the contents of the input stream.
+
+                The tokens are defined as follows:
+
+                END_OF_LINE
+                    This is a single character token and is always the '\n' 
+                    character.
+
+                END_OF_FILE
+                    This token represents the end of file.  It doesn't have any
+                    actual characters associated with it.  
+
+                IDENTIFIER
+                    This is a multi-character token.  It is defined as a string that
+                    begins with a character from get_identifier_head() and is 
+                    followed by any number of characters from get_identifier_body().
+                       
+                NUMBER
+                    This is a multi-character token.  It is defined as a sequence of
+                    numbers. 
+
+                WHITE_SPACE
+                    This is a multi-character token.  It is defined as a sequence of
+                    one or more spaces, carriage returns, and tabs.  I.e. It is
+                    composed of characters from the following string " \r\t".
+
+                CHAR
+                    This is a single character token.  It matches anything that isn't
+                    part of one of the above tokens.                    
+        !*/
+
+    public:
+
+        enum 
+        {
+            END_OF_LINE,
+            END_OF_FILE,
+            IDENTIFIER,
+            CHAR,
+            NUMBER,
+            WHITE_SPACE
+        };
+
+        tokenizer (        
+        );
+        /*!
+            ensures                
+                - #*this is properly initialized
+            throws
+                - std::bad_alloc
+        !*/
+
+        virtual ~tokenizer (
+        );
+        /*!
+            ensures
+                - any resources associated with *this have been released
+        !*/
+
+        void clear(
+        );
+        /*!
+            ensures
+                - #*this has its initial value
+            throws
+                - std::bad_alloc
+                    If this exception is thrown then #*this is unusable 
+                    until clear() is called and succeeds.
+        !*/
+
+        void set_stream (
+            std::istream& in
+        );
+        /*!
+            ensures
+                - #*this will read data from in and tokenize it
+                - #stream_is_set() == true
+                - #get_stream() == in
+        !*/
+
+        bool stream_is_set (
+        ) const;
+        /*!
+            ensures
+                - returns true if a stream has been associated with *this by calling
+                  set_stream()
+        !*/
+
+        std::istream& get_stream (
+        ) const;
+        /*!
+            requires
+                - stream_is_set() == true
+            ensures
+                - returns a reference to the istream object that *this is reading 
+                  from.
+        !*/
+
+        void get_token (
+            int& type,
+            std::string& token
+        );
+        /*!
+            requires
+                - stream_is_set() == true
+            ensures
+                - #token == the next token from the input stream get_stream()
+                - #type == the type of the token in #token
+            throws
+                - bad_alloc
+                    If this exception is thrown then the call to this function will 
+                    have no effect on *this but the values of #type and #token will be 
+                    undefined.  Additionally, some characters may have been read
+                    from the stream get_stream() and lost.
+        !*/
+
+        int peek_type (
+        ) const;
+        /*!
+            requires
+                - stream_is_set() == true
+            ensures
+                - returns the type of the token that will be returned from
+                  the next call to get_token()
+            throws
+                - bad_alloc
+                    If this exception is thrown then the call to this function will 
+                    have no effect on *this.  However, some characters may have been 
+                    read from the stream get_stream() and lost.
+        !*/
+
+        const std::string& peek_token (
+        ) const;
+        /*!
+            requires
+                - stream_is_set() == true
+            ensures
+                - returns the text of the token that will be returned from
+                  the next call to get_token()
+            throws
+                - bad_alloc
+                    If this exception is thrown then the call to this function will 
+                    have no effect on *this.  However, some characters may have been 
+                    read from the stream get_stream() and lost.
+        !*/
+
+        void set_identifier_token (
+            const std::string& head,
+            const std::string& body
+        );
+        /*!
+            requires
+                - head.find_first_of(" \r\t\n0123456789") == std::string::npos
+                  (i.e. head doesn't contain any characters from the string
+                  " \r\t\n0123456789").
+                - body.find_first_of(" \r\t\n") == std::string::npos
+                  (i.e. body doesn't contain any characters from the string " \r\t\n").
+            ensures
+                - #get_identifier_head() == head
+                - #get_identifier_body() == body
+            throws
+                - std::bad_alloc
+                    If this exception is thrown then #*this is unusable 
+                    until clear() is called and succeeds.
+        !*/
+
+        const std::string get_identifier_head (
+        ) const;
+        /*!
+            ensures
+                - returns a string containing the characters that can be the start
+                  of an IDENTIFIER token.
+            throws
+                - std::bad_alloc
+                    If this exception is thrown then the call to this function
+                    has no effect.
+        !*/
+
+        const std::string get_identifier_body (
+        ) const;
+        /*!
+            ensures
+                - returns a string containing the characters that can appear in the
+                  body of an IDENTIFIER token.
+            throws
+                - std::bad_alloc
+                    If this exception is thrown then the call to this function
+                    has no effect.
+        !*/
+
+        const std::string lowercase_letters (
+        ) const;
+        /*!
+            ensures
+                - returns "abcdefghijklmnopqrstuvwxyz"
+            throws
+                - std::bad_alloc
+                    If this exception is thrown then the call to this function
+                    has no effect.
+        !*/
+
+        const std::string uppercase_letters (
+        ) const;
+        /*!
+            ensures
+                - returns "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+            throws
+                - std::bad_alloc
+                    If this exception is thrown then the call to this function
+                    has no effect.
+        !*/
+
+        const std::string numbers (
+        ) const;
+        /*!
+            ensures
+                - returns "0123456789"
+            throws
+                - std::bad_alloc
+                    If this exception is thrown then the call to this function
+                    has no effect.
+        !*/
+
+        void swap (
+            tokenizer& item
+        );
+        /*!
+            ensures
+                - swaps *this and item
+        !*/ 
+
+    private:
+
+        // restricted functions
+        tokenizer(const tokenizer&);        // copy constructor
+        tokenizer& operator=(const tokenizer&);    // assignment operator
+
+    };    
+
+    inline void swap (
+        tokenizer& a, 
+        tokenizer& b 
+    ) 
+    { 
+        a.swap(b); 
+    }   
+    /*!
+        provides a global swap function that forwards to a.swap(b)
+    !*/
+
+}
+
+#endif // DLIB_TOKENIZER_KERNEl_ABSTRACT_
+
diff --git a/ml/dlib/dlib/tokenizer/tokenizer_kernel_c.h b/ml/dlib/dlib/tokenizer/tokenizer_kernel_c.h
new file mode 100644
index 000000000..f9604809d
--- /dev/null
+++ b/ml/dlib/dlib/tokenizer/tokenizer_kernel_c.h
@@ -0,0 +1,167 @@
+// Copyright (C) 2003  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_TOKENIZER_KERNEl_C_
+#define DLIB_TOKENIZER_KERNEl_C_
+
+#include "tokenizer_kernel_abstract.h"
+#include "../assert.h"
+#include <string>
+#include <iostream>
+
+namespace dlib
+{
+
+    // Checking layer placed on top of a tokenizer implementation.  It derives
+    // from the implementation passed as the template argument and redeclares the
+    // member functions listed below; their definitions follow later in this file.
+    template <typename tokenizer>
+    class tokenizer_kernel_c : public tokenizer
+    {
+    public:
+
+        void set_identifier_token (
+            const std::string& head,
+            const std::string& body
+        );
+
+        std::istream& get_stream () const;
+
+        int peek_type () const;
+
+        const std::string& peek_token () const;
+
+        void get_token (
+            int& type,
+            std::string& token
+        );
+    };
+
+    // Free-function form of swap for the checked wrapper: forwards to a.swap(b).
+    template <typename tokenizer>
+    inline void swap (
+        tokenizer_kernel_c<tokenizer>& a, 
+        tokenizer_kernel_c<tokenizer>& b 
+    ) 
+    { 
+        a.swap(b); 
+    }  
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+    // member function definitions
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    // Checked version of set_identifier_token(): asserts that head contains no
+    // white space, newline or digit characters and that body contains no white
+    // space or newline characters, then forwards to the base class
+    // implementation.  What happens when the assertion fails is determined by
+    // DLIB_CASSERT, which is defined outside this file.
+    template <
+        typename tokenizer
+        >
+    void tokenizer_kernel_c<tokenizer>::
+    set_identifier_token (
+        const std::string& head,
+        const std::string& body
+    ) 
+    {
+        using namespace std;
+        // make sure requires clause is not broken
+        DLIB_CASSERT( head.find_first_of(" \r\t\n0123456789") == string::npos &&
+                body.find_first_of(" \r\t\n") == string::npos ,
+            "\tvoid tokenizer::set_identifier_token()"
+            << "\n\tyou can't define the IDENTIFIER token this way."
+            << "\n\thead: " << head
+            << "\n\tbody: " << body
+            << "\n\tthis: " << this
+            );
+
+        // call the real function
+        tokenizer::set_identifier_token(head,body);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Checked version of get_stream(): asserts that a stream has been set, then
+    // forwards to the base class implementation and returns its result.  What
+    // happens when the assertion fails is determined by DLIB_CASSERT, which is
+    // defined outside this file.
+    template <
+        typename tokenizer
+        >
+    std::istream& tokenizer_kernel_c<tokenizer>::
+    get_stream (
+    ) const
+    {
+        // make sure requires clause is not broken
+        DLIB_CASSERT( this->stream_is_set() == true,
+            "\tstd::istream& tokenizer::get_stream()"
+            << "\n\tyou must set a stream for this object before you can get it"
+            << "\n\tthis: " << this
+            );
+
+        // call the real function
+        return tokenizer::get_stream();
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Checked version of peek_type(): asserts that a stream has been set, then
+    // forwards to the base class implementation and returns its result.  What
+    // happens when the assertion fails is determined by DLIB_CASSERT, which is
+    // defined outside this file.
+    template <
+        typename tokenizer
+        >
+    int tokenizer_kernel_c<tokenizer>::
+    peek_type (
+    ) const
+    {
+        // make sure requires clause is not broken
+        DLIB_CASSERT( this->stream_is_set() == true,
+            "\tint tokenizer::peek_type()"
+            << "\n\tyou must set a stream for this object before you peek at what it contains"
+            << "\n\tthis: " << this
+            );
+
+        // call the real function
+        return tokenizer::peek_type();
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Checked version of peek_token(): asserts that a stream has been set, then
+    // forwards to the base class implementation and returns its result.  What
+    // happens when the assertion fails is determined by DLIB_CASSERT, which is
+    // defined outside this file.
+    //
+    // The first line of the assertion message names the function being checked;
+    // it states the real return type (const std::string&), matching the way the
+    // other checked functions in this file report their signatures.
+    template <
+        typename tokenizer
+        >
+    const std::string& tokenizer_kernel_c<tokenizer>::
+    peek_token (
+    ) const
+    {
+        // make sure requires clause is not broken
+        DLIB_CASSERT( this->stream_is_set() == true,
+            "\tconst std::string& tokenizer::peek_token()"
+            << "\n\tyou must set a stream for this object before you peek at what it contains"
+            << "\n\tthis: " << this
+            );
+
+        // call the real function
+        return tokenizer::peek_token();
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    // Checked version of get_token(): asserts that a stream has been set, then
+    // forwards type and token to the base class implementation, which fills them
+    // in.  What happens when the assertion fails is determined by DLIB_CASSERT,
+    // which is defined outside this file.
+    template <
+        typename tokenizer
+        >
+    void tokenizer_kernel_c<tokenizer>::
+    get_token (
+        int& type,
+        std::string& token
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_CASSERT( this->stream_is_set() == true,
+            "\tvoid tokenizer::get_token()"
+            << "\n\tyou must set a stream for this object before you can get tokens from it."
+            << "\n\tthis: " << this
+            );
+
+        // call the real function
+        tokenizer::get_token(type,token);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_TOKENIZER_KERNEl_C_
+
+