diff options
Diffstat (limited to 'ml/dlib/dlib/tokenizer')
-rw-r--r-- | ml/dlib/dlib/tokenizer/tokenizer_kernel_1.cpp | 295 | ||||
-rw-r--r-- | ml/dlib/dlib/tokenizer/tokenizer_kernel_1.h | 155 | ||||
-rw-r--r-- | ml/dlib/dlib/tokenizer/tokenizer_kernel_abstract.h | 289 | ||||
-rw-r--r-- | ml/dlib/dlib/tokenizer/tokenizer_kernel_c.h | 167 |
4 files changed, 906 insertions, 0 deletions
diff --git a/ml/dlib/dlib/tokenizer/tokenizer_kernel_1.cpp b/ml/dlib/dlib/tokenizer/tokenizer_kernel_1.cpp
new file mode 100644
index 000000000..daa83184c
--- /dev/null
+++ b/ml/dlib/dlib/tokenizer/tokenizer_kernel_1.cpp
@@ -0,0 +1,295 @@
+// Copyright (C) 2005 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_TOKENIZER_KERNEL_1_CPp_
+#define DLIB_TOKENIZER_KERNEL_1_CPp_
+#include "tokenizer_kernel_1.h"
+
+#include <iostream>
+#include <cstdio>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+    // Allocates the two character lookup tables and puts the object into its initial state.
+    tokenizer_kernel_1::
+    tokenizer_kernel_1 (
+    ) :
+        headset(0),
+        bodyset(0), next_type(0),   // next_type gets a defined value so swap() never reads an uninitialized int
+        have_peeked(false)
+    {
+        try
+        {
+            headset = new bool[UCHAR_MAX+1];    // one flag per unsigned char value, 0..UCHAR_MAX inclusive
+            bodyset = new bool[UCHAR_MAX+1];    // (UCHAR_MAX slots would put index UCHAR_MAX out of bounds)
+
+            clear();
+        }
+        catch (...)
+        {
+            if (headset) delete [] headset;
+            if (bodyset) delete [] bodyset;
+            throw;
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    tokenizer_kernel_1::
+    ~tokenizer_kernel_1 (
+    )
+    {
+        delete [] bodyset;
+        delete [] headset;
+    }
+
+// ----------------------------------------------------------------------------------------
+    // Detaches any stream, drops a pending peek and restores the default identifier character sets.
+    void tokenizer_kernel_1::
+    clear(
+    )
+    {
+        using namespace std;
+
+        in = 0;
+        streambuf = 0;
+        have_peeked = false;
+
+        head = "_" + lowercase_letters() + uppercase_letters();
+        body = "_" + lowercase_letters() + uppercase_letters() + numbers();
+
+        for (unsigned long i = 0; i <= UCHAR_MAX; ++i)    // inclusive bound: reset every table slot
+        {
+            headset[i] = false;
+            bodyset[i] = false;
+        }
+
+        for (string::size_type i = 0; i < head.size(); ++i)
+            headset[static_cast<unsigned char>(head[i])] = true;
+        for (string::size_type i = 0; i < body.size(); ++i)
+            bodyset[static_cast<unsigned char>(body[i])] = true;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    void tokenizer_kernel_1::
+    set_stream (
+        std::istream& in_
+    )
+    {
+        in = &in_;
+        streambuf = in_.rdbuf();
+        have_peeked = false;    // a token peeked from the previous stream is discarded
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    bool tokenizer_kernel_1::
+    stream_is_set (
+    ) const
+    {
+        return (in != 0);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    std::istream& tokenizer_kernel_1::
+    get_stream (
+    ) const
+    {
+        return *in;
+    }
+
+// ----------------------------------------------------------------------------------------
+    // Reads the next token from streambuf, or hands back the token buffered by an earlier peek.
+    void tokenizer_kernel_1::
+    get_token (
+        int& type,
+        std::string& token
+    )
+    {
+        if (!have_peeked)
+        {
+            std::streambuf::int_type ch;
+            ch = streambuf->sbumpc();
+
+            switch (ch)
+            {
+            case EOF:
+                type = END_OF_FILE;
+                token.clear();
+                return;
+
+            case '\n':
+                type = END_OF_LINE;
+                token = "\n";
+                return;
+
+            case '\r':
+            case ' ':
+            case '\t':
+                type = WHITE_SPACE;
+                token = static_cast<char>(ch);
+                ch = streambuf->sgetc();    // look at the next char without consuming it
+                while ((ch == ' ' || ch == '\t' || ch == '\r') && ch != EOF)
+                {
+                    token += static_cast<char>(ch);
+                    ch = streambuf->snextc();    // consume it and look at the one after
+                }
+                return;
+
+            default:
+                if (headset[static_cast<unsigned char>(ch)])
+                {
+                    type = IDENTIFIER;
+                    token = static_cast<char>(ch);
+                    ch = streambuf->sgetc();
+                    while ( ch != EOF && bodyset[static_cast<unsigned char>(ch)] )    // test EOF before the table lookup
+                    {
+                        token += static_cast<char>(ch);
+                        ch = streambuf->snextc();
+                    }
+                }
+                else if ('0' <= ch && ch <= '9')
+                {
+                    type = NUMBER;
+                    token = static_cast<char>(ch);
+                    ch = streambuf->sgetc();
+                    while (('0' <= ch && ch <= '9') && ch != EOF)
+                    {
+                        token += static_cast<char>(ch);
+                        ch = streambuf->snextc();
+                    }
+                }
+                else
+                {
+                    type = CHAR;
+                    token = static_cast<char>(ch);
+                }
+                return;
+            } // switch (ch)
+        }
+
+        // if we get this far it means we have peeked so we should
+        // return the peek data.
+        type = next_type;
+        token = next_token;
+        have_peeked = false;
+    }
+
+// ----------------------------------------------------------------------------------------
+    // Buffers the next token in next_type/next_token (if not already buffered) and returns its type.
+    int tokenizer_kernel_1::
+    peek_type (
+    ) const
+    {
+        const_cast<tokenizer_kernel_1*>(this)->get_token(next_type,next_token);    // returns the pending token unchanged if have_peeked
+        have_peeked = true;
+        return next_type;
+    }
+
+// ----------------------------------------------------------------------------------------
+    // Buffers the next token in next_type/next_token (if not already buffered) and returns its text.
+    const std::string& tokenizer_kernel_1::
+    peek_token (
+    ) const
+    {
+        const_cast<tokenizer_kernel_1*>(this)->get_token(next_type,next_token);    // returns the pending token unchanged if have_peeked
+        have_peeked = true;
+        return next_token;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    void tokenizer_kernel_1::
+    swap (
+        tokenizer_kernel_1& item
+    )
+    {
+        exchange(in,item.in);
+        exchange(streambuf,item.streambuf);
+        exchange(head,item.head);
+        exchange(body,item.body);
+        exchange(bodyset,item.bodyset);
+        exchange(headset,item.headset);
+        exchange(have_peeked,item.have_peeked);
+        exchange(next_type,item.next_type);
+        exchange(next_token,item.next_token);
+    }
+
+// ----------------------------------------------------------------------------------------
+    // Replaces the IDENTIFIER head/body character sets and rebuilds both lookup tables from them.
+    void tokenizer_kernel_1::
+    set_identifier_token (
+        const std::string& head_,
+        const std::string& body_
+    )
+    {
+        using namespace std;
+
+        head = head_;
+        body = body_;
+
+        for (unsigned long i = 0; i <= UCHAR_MAX; ++i)    // inclusive bound: reset every table slot
+        {
+            headset[i] = false;
+            bodyset[i] = false;
+        }
+
+        for (string::size_type i = 0; i < head.size(); ++i)
+            headset[static_cast<unsigned char>(head[i])] = true;
+        for (string::size_type i = 0; i < body.size(); ++i)
+            bodyset[static_cast<unsigned char>(body[i])] = true;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    const std::string tokenizer_kernel_1::
+    get_identifier_head (
+    ) const
+    {
+        return head;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    const std::string tokenizer_kernel_1::
+    get_identifier_body (
+    ) const
+    {
+        return body;
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    const std::string tokenizer_kernel_1::
+    lowercase_letters (
+    ) const
+    {
+        return std::string("abcdefghijklmnopqrstuvwxyz");
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    const std::string tokenizer_kernel_1::
+    uppercase_letters (
+    ) const
+    {
+        return std::string("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    const std::string tokenizer_kernel_1::
+    numbers (
+    ) const
+    {
+        return std::string("0123456789");
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+#endif // DLIB_TOKENIZER_KERNEL_1_CPp_
+
diff --git a/ml/dlib/dlib/tokenizer/tokenizer_kernel_1.h b/ml/dlib/dlib/tokenizer/tokenizer_kernel_1.h
new file mode 100644
index 000000000..d67ae278f
--- /dev/null
+++ b/ml/dlib/dlib/tokenizer/tokenizer_kernel_1.h
@@ -0,0 +1,155 @@
+// Copyright (C) 2005 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_TOKENIZER_KERNEl_1_
+#define DLIB_TOKENIZER_KERNEl_1_
+
+#include <string>
+#include <iosfwd>
+#include <climits>
+#include "../algs.h"
+#include "tokenizer_kernel_abstract.h"
+
+namespace dlib
+{
+
+    class tokenizer_kernel_1
+    {
+        /*!
+            INITIAL VALUE
+                - in == 0
+                - streambuf == 0
+                - have_peeked == false
+                - head == "_" + lowercase_letters() + uppercase_letters()
+                - body == "_" + lowercase_letters() + uppercase_letters() + numbers()
+                - headset == pointer to an array of UCHAR_MAX+1 bools and set according
+                  to the CONVENTION.
+                - bodyset == pointer to an array of UCHAR_MAX+1 bools and set according
+                  to the CONVENTION.
+
+            CONVENTION
+                - if (stream_is_set()) then
+                    - get_stream() == *in
+                    - streambuf == in->rdbuf()
+                - else
+                    - in == 0
+                    - streambuf == 0
+
+                - body == get_identifier_body()
+                - head == get_identifier_head()
+
+                - if (the char x appears in head) then
+                    - headset[static_cast<unsigned char>(x)] == true
+                - else
+                    - headset[static_cast<unsigned char>(x)] == false
+
+                - if (the char x appears in body) then
+                    - bodyset[static_cast<unsigned char>(x)] == true
+                - else
+                    - bodyset[static_cast<unsigned char>(x)] == false
+
+                - if (have_peeked) then
+                    - next_token == the next token to be returned from get_token()
+                    - next_type == the type of the token in next_token
+        !*/
+
+    public:
+
+        // The name of this enum is irrelevant but on some compilers (gcc on MAC OS X) not having it named
+        // causes an error for whatever reason
+        enum some_random_name
+        {
+            END_OF_LINE,
+            END_OF_FILE,
+            IDENTIFIER,
+            CHAR,
+            NUMBER,
+            WHITE_SPACE
+        };
+
+        tokenizer_kernel_1 (
+        );
+
+        virtual ~tokenizer_kernel_1 (
+        );
+
+        void clear(
+        );
+
+        void set_stream (
+            std::istream& in
+        );
+
+        bool stream_is_set (
+        ) const;
+
+        std::istream& get_stream (
+        ) const;
+
+        void get_token (
+            int& type,
+            std::string& token
+        );
+
+        void swap (
+            tokenizer_kernel_1& item
+        );
+
+        void set_identifier_token (
+            const std::string& head,
+            const std::string& body
+        );
+
+        int peek_type (
+        ) const;
+
+        const std::string& peek_token (
+        ) const;
+
+        const std::string get_identifier_head (
+        ) const;
+
+        const std::string get_identifier_body (
+        ) const;
+
+        const std::string lowercase_letters (
+        ) const;
+
+        const std::string uppercase_letters (
+        ) const;
+
+        const std::string numbers (
+        ) const;
+
+    private:
+
+        // restricted functions
+        tokenizer_kernel_1(const tokenizer_kernel_1&);        // copy constructor
+        tokenizer_kernel_1& operator=(const tokenizer_kernel_1&);    // assignment operator
+
+
+        // data members
+        std::istream* in;
+        std::streambuf* streambuf;
+        std::string head;
+        std::string body;
+        bool* headset;
+        bool* bodyset;
+
+        mutable std::string next_token;
+        mutable int next_type;
+        mutable bool have_peeked;
+    };
+
+    inline void swap (
+        tokenizer_kernel_1& a,
+        tokenizer_kernel_1& b
+    ) { a.swap(b); }
+
+}
+
+#ifdef NO_MAKEFILE
+#include "tokenizer_kernel_1.cpp"
+#endif
+
+#endif // DLIB_TOKENIZER_KERNEl_1_
+
diff --git a/ml/dlib/dlib/tokenizer/tokenizer_kernel_abstract.h b/ml/dlib/dlib/tokenizer/tokenizer_kernel_abstract.h
new file mode 100644
index 000000000..f534b8f7f
--- /dev/null
+++ b/ml/dlib/dlib/tokenizer/tokenizer_kernel_abstract.h
@@ -0,0 +1,289 @@
+// Copyright (C) 2005 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#undef DLIB_TOKENIZER_KERNEl_ABSTRACT_
+#ifdef DLIB_TOKENIZER_KERNEl_ABSTRACT_
+
+#include <string>
+#include <iosfwd>
+
+namespace dlib
+{
+
+    class tokenizer
+    {
+        /*!
+            INITIAL VALUE
+                stream_is_set() == false
+                get_identifier_head() == "_" + lowercase_letters() + uppercase_letters()
+                get_identifier_body() == "_" + lowercase_letters() + uppercase_letters() +
+                                         numbers()
+
+            WHAT THIS OBJECT REPRESENTS
+                This object represents a simple tokenizer for textual data.
+
+            BUFFERING
+                This object is allowed to buffer data from the input stream.
+                Thus if you clear it or switch streams (via calling set_stream())
+                any buffered data will be lost.
+
+            TOKENS
+                When picking out tokens the tokenizer will always extract the
+                longest token it can. For example, if faced with the string
+                "555" it will consider the three 5s to be a single NUMBER
+                token not three smaller NUMBER tokens.
+
+                Also note that no characters in the input stream are discarded.
+                They will all be returned in the text of some token.
+                Additionally, each character will never be returned more than once.
+                This means that if you concatenated all returned tokens it would exactly
+                reproduce the contents of the input stream.
+
+                The tokens are defined as follows:
+
+                END_OF_LINE
+                    This is a single character token and is always the '\n'
+                    character.
+
+                END_OF_FILE
+                    This token represents the end of file. It doesn't have any
+                    actual characters associated with it.
+
+                IDENTIFIER
+                    This is a multi-character token. It is defined as a string that
+                    begins with a character from get_identifier_head() and is
+                    followed by any number of characters from get_identifier_body().
+
+                NUMBER
+                    This is a multi-character token. It is defined as a sequence of
+                    numbers.
+
+                WHITE_SPACE
+                    This is a multi-character token. It is defined as a sequence of
+                    one or more spaces, carriage returns, and tabs. I.e. It is
+                    composed of characters from the following string " \r\t".
+
+                CHAR
+                    This is a single character token. It matches anything that isn't
+                    part of one of the above tokens.
+        !*/
+
+    public:
+
+        enum
+        {
+            END_OF_LINE,
+            END_OF_FILE,
+            IDENTIFIER,
+            CHAR,
+            NUMBER,
+            WHITE_SPACE
+        };
+
+        tokenizer (
+        );
+        /*!
+            ensures
+                - #*this is properly initialized
+            throws
+                - std::bad_alloc
+        !*/
+
+        virtual ~tokenizer (
+        );
+        /*!
+            ensures
+                - any resources associated with *this have been released
+        !*/
+
+        void clear(
+        );
+        /*!
+            ensures
+                - #*this has its initial value
+            throws
+                - std::bad_alloc
+                    If this exception is thrown then #*this is unusable
+                    until clear() is called and succeeds.
+        !*/
+
+        void set_stream (
+            std::istream& in
+        );
+        /*!
+            ensures
+                - #*this will read data from in and tokenize it
+                - #stream_is_set() == true
+                - #get_stream() == in
+        !*/
+
+        bool stream_is_set (
+        ) const;
+        /*!
+            ensures
+                - returns true if a stream has been associated with *this by calling
+                  set_stream()
+        !*/
+
+        std::istream& get_stream (
+        ) const;
+        /*!
+            requires
+                - stream_is_set() == true
+            ensures
+                - returns a reference to the istream object that *this is reading
+                  from.
+        !*/
+
+        void get_token (
+            int& type,
+            std::string& token
+        );
+        /*!
+            requires
+                - stream_is_set() == true
+            ensures
+                - #token == the next token from the input stream get_stream()
+                - #type == the type of the token in #token
+            throws
+                - bad_alloc
+                    If this exception is thrown then the call to this function will
+                    have no effect on *this but the values of #type and #token will be
+                    undefined. Additionally, some characters may have been read
+                    from the stream get_stream() and lost.
+        !*/
+
+        int peek_type (
+        ) const;
+        /*!
+            requires
+                - stream_is_set() == true
+            ensures
+                - returns the type of the token that will be returned from
+                  the next call to get_token()
+            throws
+                - bad_alloc
+                    If this exception is thrown then the call to this function will
+                    have no effect on *this. However, some characters may have been
+                    read from the stream get_stream() and lost.
+        !*/
+
+        const std::string& peek_token (
+        ) const;
+        /*!
+            requires
+                - stream_is_set() == true
+            ensures
+                - returns the text of the token that will be returned from
+                  the next call to get_token()
+            throws
+                - bad_alloc
+                    If this exception is thrown then the call to this function will
+                    have no effect on *this. However, some characters may have been
+                    read from the stream get_stream() and lost.
+        !*/
+
+        void set_identifier_token (
+            const std::string& head,
+            const std::string& body
+        );
+        /*!
+            requires
+                - head.find_first_of(" \r\t\n0123456789") == std::string::npos
+                  (i.e. head doesn't contain any characters from the string
+                  " \r\t\n0123456789").
+                - body.find_first_of(" \r\t\n") == std::string::npos
+                  (i.e. body doesn't contain any characters from the string " \r\t\n").
+            ensures
+                - #get_identifier_head() == head
+                - #get_identifier_body() == body
+            throws
+                - std::bad_alloc
+                    If this exception is thrown then #*this is unusable
+                    until clear() is called and succeeds.
+        !*/
+
+        const std::string get_identifier_head (
+        ) const;
+        /*!
+            ensures
+                - returns a string containing the characters that can be the start
+                  of an IDENTIFIER token.
+            throws
+                - std::bad_alloc
+                    If this exception is thrown then the call to this function
+                    has no effect.
+        !*/
+
+        const std::string get_identifier_body (
+        ) const;
+        /*!
+            ensures
+                - returns a string containing the characters that can appear in the
+                  body of an IDENTIFIER token.
+            throws
+                - std::bad_alloc
+                    If this exception is thrown then the call to this function
+                    has no effect.
+        !*/
+
+        const std::string lowercase_letters (
+        ) const;
+        /*!
+            ensures
+                - returns "abcdefghijklmnopqrstuvwxyz"
+            throws
+                - std::bad_alloc
+                    If this exception is thrown then the call to this function
+                    has no effect.
+        !*/
+
+        const std::string uppercase_letters (
+        ) const;
+        /*!
+            ensures
+                - returns "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+            throws
+                - std::bad_alloc
+                    If this exception is thrown then the call to this function
+                    has no effect.
+        !*/
+
+        const std::string numbers (
+        ) const;
+        /*!
+            ensures
+                - returns "0123456789"
+            throws
+                - std::bad_alloc
+                    If this exception is thrown then the call to this function
+                    has no effect.
+        !*/
+
+        void swap (
+            tokenizer& item
+        );
+        /*!
+            ensures
+                - swaps *this and item
+        !*/
+
+    private:
+
+        // restricted functions
+        tokenizer(const tokenizer&);        // copy constructor
+        tokenizer& operator=(const tokenizer&);    // assignment operator
+
+    };
+
+    inline void swap (
+        tokenizer& a,
+        tokenizer& b
+    ) { a.swap(b); }
+    /*!
+        provides a global swap function
+    !*/
+
+}
+
+#endif // DLIB_TOKENIZER_KERNEl_ABSTRACT_
+
diff --git a/ml/dlib/dlib/tokenizer/tokenizer_kernel_c.h b/ml/dlib/dlib/tokenizer/tokenizer_kernel_c.h
new file mode 100644
index 000000000..f9604809d
--- /dev/null
+++ b/ml/dlib/dlib/tokenizer/tokenizer_kernel_c.h
@@ -0,0 +1,167 @@
+// Copyright (C) 2003 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+#ifndef DLIB_TOKENIZER_KERNEl_C_
+#define DLIB_TOKENIZER_KERNEl_C_
+
+#include "tokenizer_kernel_abstract.h"
+#include "../assert.h"
+#include <string>
+#include <iostream>
+
+namespace dlib
+{
+
+    template <
+        typename tokenizer
+        >
+    class tokenizer_kernel_c : public tokenizer
+    {
+        // Checking wrapper: each member below asserts its requires clause with DLIB_CASSERT, then calls the base tokenizer.
+        public:
+            std::istream& get_stream (
+            ) const;
+
+            void get_token (
+                int& type,
+                std::string& token
+            );
+
+            void set_identifier_token (
+                const std::string& head,
+                const std::string& body
+            );
+
+            int peek_type (
+            ) const;
+
+            const std::string& peek_token (
+            ) const;
+    };
+
+    template <
+        typename tokenizer
+        >
+    inline void swap (
+        tokenizer_kernel_c<tokenizer>& a,
+        tokenizer_kernel_c<tokenizer>& b
+    ) { a.swap(b); }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+    // member function definitions
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename tokenizer
+        >
+    void tokenizer_kernel_c<tokenizer>::
+    set_identifier_token (
+        const std::string& head,
+        const std::string& body
+    )
+    {
+        using namespace std;
+        // make sure requires clause is not broken
+        DLIB_CASSERT( head.find_first_of(" \r\t\n0123456789") == string::npos &&
+                      body.find_first_of(" \r\t\n") == string::npos ,
+            "\tvoid tokenizer::set_identifier_token()"
+            << "\n\tyou can't define the IDENTIFIER token this way."
+            << "\n\thead: " << head
+            << "\n\tbody: " << body
+            << "\n\tthis: " << this
+            );
+
+        // call the real function
+        tokenizer::set_identifier_token(head,body);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename tokenizer
+        >
+    std::istream& tokenizer_kernel_c<tokenizer>::
+    get_stream (
+    ) const
+    {
+        // make sure requires clause is not broken
+        DLIB_CASSERT( this->stream_is_set() == true,
+            "\tstd::istream& tokenizer::get_stream()"
+            << "\n\tyou must set a stream for this object before you can get it"
+            << "\n\tthis: " << this
+            );
+
+        // call the real function
+        return tokenizer::get_stream();
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename tokenizer
+        >
+    int tokenizer_kernel_c<tokenizer>::
+    peek_type (
+    ) const
+    {
+        // make sure requires clause is not broken
+        DLIB_CASSERT( this->stream_is_set() == true,
+            "\tint tokenizer::peek_type()"
+            << "\n\tyou must set a stream for this object before you peek at what it contains"
+            << "\n\tthis: " << this
+            );
+
+        // call the real function
+        return tokenizer::peek_type();
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename tokenizer
+        >
+    const std::string& tokenizer_kernel_c<tokenizer>::
+    peek_token (
+    ) const
+    {
+        // make sure requires clause is not broken
+        DLIB_CASSERT( this->stream_is_set() == true,
+            "\tconst std::string& tokenizer::peek_token()"
+            << "\n\tyou must set a stream for this object before you peek at what it contains"
+            << "\n\tthis: " << this
+            );
+
+        // call the real function
+        return tokenizer::peek_token();
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename tokenizer
+        >
+    void tokenizer_kernel_c<tokenizer>::
+    get_token (
+        int& type,
+        std::string& token
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_CASSERT( this->stream_is_set() == true,
+            "\tvoid tokenizer::get_token()"
+            << "\n\tyou must set a stream for this object before you can get tokens from it."
+            << "\n\tthis: " << this
+            );
+
+        // call the real function
+        tokenizer::get_token(type,token);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_TOKENIZER_KERNEl_C_
+
+