diff options
Diffstat (limited to 'src/boost/libs/flyweight/example/html.cpp')
-rw-r--r-- | src/boost/libs/flyweight/example/html.cpp | 321 |
1 files changed, 321 insertions, 0 deletions
diff --git a/src/boost/libs/flyweight/example/html.cpp b/src/boost/libs/flyweight/example/html.cpp new file mode 100644 index 00000000..1fa6d34c --- /dev/null +++ b/src/boost/libs/flyweight/example/html.cpp @@ -0,0 +1,321 @@ +/* Boost.Flyweight example of flyweight-based formatted text processing. + * + * Copyright 2006-2014 Joaquin M Lopez Munoz. + * Distributed under the Boost Software License, Version 1.0. + * (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + * + * See http://www.boost.org/libs/flyweight for library home page. + */ + +#include <boost/flyweight.hpp> +#include <boost/functional/hash.hpp> +#include <algorithm> +#include <cctype> +#include <cstdio> +#include <fstream> +#include <iostream> +#include <iterator> +#include <sstream> +#include <string> +#include <vector> + +#if defined(BOOST_NO_STDC_NAMESPACE) +namespace std{using ::exit;using ::tolower;} +#endif + +using namespace boost::flyweights; + +/* An HTML tag consists of a name and optional properties of the form + * name1=value1 ... namen=valuen. We do not need to parse the properties + * for the purposes of the program, hence they are all stored in + * html_tag_data::properties in raw form. + */ + +struct html_tag_data +{ + std::string name; + std::string properties; +}; + +bool operator==(const html_tag_data& x,const html_tag_data& y) +{ + return x.name==y.name&&x.properties==y.properties; +} + +/* See the portability section of Boost.Hash at + * http://boost.org/doc/html/hash/portability.html + * for an explanation of the ADL-related workarounds. + */ + +#if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP) +namespace boost{ +#endif + +std::size_t hash_value(const html_tag_data& x) +{ + std::size_t res=0; + boost::hash_combine(res,x.name); + boost::hash_combine(res,x.properties); + return res; +} + +#if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP) +} /* namespace boost */ +#endif + +typedef flyweight<html_tag_data> html_tag; + +/* parse_tag is passed an iterator positioned at the first char of + * the tag after the opening '<' and returns, if succesful, a parsed tag + * and whether it is opening (<xx>) or closing (</xx>). + */ + +enum tag_type{opening,closing,failure}; + +struct parse_tag_res +{ + parse_tag_res(tag_type type_,const html_tag_data& tag_=html_tag_data()): + type(type_),tag(tag_){} + parse_tag_res(const parse_tag_res& x):type(x.type),tag(x.tag){} + + tag_type type; + html_tag tag; +}; + +template<typename ForwardIterator> +parse_tag_res parse_tag(ForwardIterator& first,ForwardIterator last) +{ + html_tag_data tag; + std::string buf; + bool in_quote=false; + for(ForwardIterator it=first;it!=last;){ + char ch=*it++; + if(ch=='>'&&!in_quote){ /* ignore '>'s if inside quotes */ + tag_type type; + std::string::size_type + bname=buf.find_first_not_of("\t\n\r "), + ename=bname==std::string::npos? + std::string::npos: + buf.find_first_of("\t\n\r ",bname), + bprop=ename==std::string::npos? + std::string::npos: + buf.find_first_not_of("\t\n\r ",ename); + if(bname==ename){ /* null name */ + return parse_tag_res(failure); + } + else if(buf[bname]=='/'){ /* closing tag */ + type=closing; + ++bname; + } + else type=opening; + tag.name=buf.substr(bname,ename-bname); + std::transform( /* normalize tag name to lower case */ + tag.name.begin(),tag.name.end(),tag.name.begin(), + (int(*)(int))std::tolower); + if(bprop!=std::string::npos){ + tag.properties=buf.substr(bprop,buf.size()); + } + first=it; /* result good, consume the chars */ + return parse_tag_res(type,tag); + } + else{ + if(ch=='"')in_quote=!in_quote; + buf+=ch; + } + } + return parse_tag_res(failure); /* end reached and found no '>' */ +} + +/* A character context is just a vector containing the tags enclosing the + * character, from the outermost level to the innermost. + */ + +typedef std::vector<html_tag> html_context_data; +typedef flyweight<html_context_data> html_context; + +/* A character is a char code plus its context. + */ + +struct character_data +{ + character_data(char code_=0,html_context context_=html_context()): + code(code_),context(context_){} + character_data(const character_data& x):code(x.code),context(x.context){} + + char code; + html_context context; +}; + +bool operator==(const character_data& x,const character_data& y) +{ + return x.code==y.code&&x.context==y.context; +} + +#if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP) +namespace boost{ +#endif + +std::size_t hash_value(const character_data& x) +{ + std::size_t res=0; + boost::hash_combine(res,x.code); + boost::hash_combine(res,x.context); + return res; +} + +#if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP) +} /* namespace boost */ +#endif + +typedef flyweight<character_data> character; + +/* scan_html converts HTML code into a stream of contextualized characters. + */ + +template<typename ForwardIterator,typename OutputIterator> +void scan_html(ForwardIterator first,ForwardIterator last,OutputIterator out) +{ + html_context_data context; + while(first!=last){ + if(*first=='<'){ /* tag found */ + ++first; + parse_tag_res res=parse_tag(first,last); + if(res.type==opening){ /* add to contex */ + context.push_back(res.tag); + continue; + } + else if(res.type==closing){ /* remove from context */ + /* Pop all tags from the innermost to the matching one; this takes + * care of missing </xx>s like vg. in <ul><li>hello</ul>. + */ + + for(html_context_data::reverse_iterator rit=context.rbegin(); + rit!=context.rend();++rit){ + if(rit->get().name==res.tag.get().name){ + context.erase(rit.base()-1,context.end()); + break; + } + } + continue; + } + } + *out++=character(*first++,html_context(context)); + } +} + +/* HTML-producing utilities */ + +void print_opening_tag(std::ostream& os,const html_tag_data& x) +{ + os<<"<"<<x.name; + if(!x.properties.empty())os<<" "<<x.properties; + os<<">"; +} + +void print_closing_tag(std::ostream& os,const html_tag_data& x) +{ + /* SGML declarations (beginning with '!') are not closed */ + + if(x.name[0]!='!')os<<"</"<<x.name<<">"; +} + +/* change_context takes contexts from and to with tags + * + * from<- c1 ... cn fn+1 ... fm + * to <- c1 ... cn tn+1 ... tk + * + * (that is, they share the first n tags, n might be 0), and + * produces code closing fm ... fn+1 and opening tn+1 ... tk. + */ + +template<typename OutputIterator> +void change_context( + const html_context_data& from,const html_context_data& to, + OutputIterator out) +{ + std::ostringstream oss; + html_context_data::const_iterator + it0=from.begin(), + it0_end=from.end(), + it1=to.begin(), + it1_end=to.end(); + for(;it0!=it0_end&&it1!=it1_end&&*it0==*it1;++it0,++it1); + while(it0_end!=it0)print_closing_tag(oss,*--it0_end); + while(it1!=it1_end)print_opening_tag(oss,*it1++); + std::string str=oss.str(); + std::copy(str.begin(),str.end(),out); +} + +/* produce_html is passed a bunch of contextualized characters and emits + * the corresponding HTML. The algorithm is simple: tags are opened and closed + * as a result of the context from one character to the following changing. + */ + +template<typename ForwardIterator,typename OutputIterator> +void produce_html(ForwardIterator first,ForwardIterator last,OutputIterator out) +{ + html_context context; + while(first!=last){ + if(first->get().context!=context){ + change_context(context,first->get().context,out); + context=first->get().context; + } + *out++=(first++)->get().code; + } + change_context(context,html_context(),out); /* close remaining context */ +} + +/* Without these explicit instantiations, MSVC++ 6.5/7.0 does not + * find some friend operators in certain contexts. + */ + +character dummy1; +html_tag dummy2; + +int main() +{ + std::cout<<"input html file: "; + std::string in; + std::getline(std::cin,in); + std::ifstream ifs(in.c_str()); + if(!ifs){ + std::cout<<"can't open "<<in<<std::endl; + std::exit(EXIT_FAILURE); + } + typedef std::istreambuf_iterator<char> istrbuf_iterator; + std::vector<char> html_source; + std::copy( + istrbuf_iterator(ifs),istrbuf_iterator(), + std::back_inserter(html_source)); + + /* parse the HTML */ + + std::vector<character> scanned_html; + scan_html( + html_source.begin(),html_source.end(),std::back_inserter(scanned_html)); + + /* Now that we have the text as a vector of contextualized characters, + * we can shuffle it around and manipulate in almost any way we please. + * For instance, the following reverses the central portion of the doc. + */ + + std::reverse( + scanned_html.begin()+scanned_html.size()/4, + scanned_html.begin()+3*(scanned_html.size()/4)); + + /* emit the resulting HTML */ + + std::cout<<"output html file: "; + std::string out; + std::getline(std::cin,out); + std::ofstream ofs(out.c_str()); + if(!ofs){ + std::cout<<"can't open "<<out<<std::endl; + std::exit(EXIT_FAILURE); + } + typedef std::ostreambuf_iterator<char> ostrbuf_iterator; + produce_html(scanned_html.begin(),scanned_html.end(),ostrbuf_iterator(ofs)); + + return 0; +} |