diff options
Diffstat (limited to '')
-rw-r--r-- | debian/src/geoip-asn-csv-to-dat.cpp | 897 |
1 files changed, 897 insertions, 0 deletions
diff --git a/debian/src/geoip-asn-csv-to-dat.cpp b/debian/src/geoip-asn-csv-to-dat.cpp new file mode 100644 index 0000000..4db11cc --- /dev/null +++ b/debian/src/geoip-asn-csv-to-dat.cpp @@ -0,0 +1,897 @@ +/* geoip-csv-to-dat - convert a country database from CSV to GeoIP binary format + * + * Copyright (c) 2009 Kalle Olavi Niemitalo. + * Copyright (c) 2011 Patrick Matthäi + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#define _GNU_SOURCE 1 +#include <algorithm> +#include <arpa/inet.h> +#include <cerrno> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <error.h> +#include <fstream> +#include <getopt.h> +#include <iostream> +#include <queue> +#include <set> +#include <stack> +#include <string> +#include <sys/socket.h> +#include <sys/types.h> +#include <sysexits.h> +#include <vector> +#include <GeoIP.h> +#include <map> + +// Format of GeoIP Country database files +// ====================================== +// +// 1. Binary trie mapping IP addresses to countries. +// 2. Optional unused data. +// 3. Optional database-info block. +// 4. Optional structure-info block. +// +// Binary trie +// ----------- +// +// The trie treats IP addresses as bit sequences and maps them to +// numbers. In the country database, each such number is 0xFFFF00 + +// the the country ID that GeoIP_id_by_ipnum() would return. The +// meanings of country IDs are hardcoded in libGeoIP and cannot be +// overridden by the database. +// +// The root node of the trie is at the beginning of the file, and the +// other nodes then follow it. Each node has the same size and +// consists of two little-endian pointers that correspond to the two +// possible values of a bit. In the country database, the pointers are +// 24-bit, and each node is thus 6 bytes long. +// +// Each pointer is one of: +// - The number that the whole lookup should return, i.e. 0xFFFF00 + id +// in the country database. +// - The number of the node that should be examined next, counting from +// 0 at the beginning of the file. Pointing back to nodes with +// smaller numbers is allowed, but loops are not allowed. +// +// Optional unused data +// -------------------- +// +// The file format seems to permit extra data between the binary trie +// and the optional blocks. +// +// Optional database-info block +// ---------------------------- +// +// Near the end of the file, there may be a three-byte tag (0x00 0x00 +// 0x00) followed by at most DATABASE_INFO_MAX_SIZE - 1 = 99 bytes of +// text that describes the database. GeoIP_database_info() returns +// this text and appends a terminating '\0'. +// +// The GeoLite Country IPv4 database downloadable from MaxMind +// includes this database-info block. +// +// Optional structure-info block +// ----------------------------- +// +// At the very end of the file, there may be a three-byte tag (0xFF +// 0xFF 0xFF) followed by at most STRUCTURE_INFO_MAX_SIZE - 1 = 19 +// bytes. The first byte is the database type, +// e.g. GEOIP_COUNTRY_EDITION = 1 or GEOIP_COUNTRY_EDITION_V6 = 12, +// possibly with 105 added to it. Type-specific information then +// follows. There is no type-specific information for the country +// editions. +// +// The GeoLite Country IPv4 database downloadable from MaxMind does +// not include this structure-info block. + +namespace { + class binary_trie + { + public: + typedef uint_fast32_t edge_type; + + explicit binary_trie(edge_type leaf); + void set_range( + const uint8_t range_min[], + const uint8_t range_max[], + std::size_t bit_count, + edge_type leaf); + void reorder_depth_first(); + void reorder_in_blocks(std::size_t bytes_per_block); + void write_binary(std::ostream &dat_stream) const; + void write_segment(std::ostream &dat_stream) const; + void update_records(); + + private: + struct node + { + edge_type edges[2]; + }; + std::vector<node> nodes; + + // This could be std::vector<bool> but that seems slower. + typedef std::vector<uint8_t> bits_vector; + + void set_range_in_node( + const bits_vector *min_bits, + const bits_vector *max_bits, + std::size_t bit_pos, + edge_type edit_node, + edge_type leaf); + void set_range_in_edge( + const bits_vector *min_bits, + const bits_vector *max_bits, + std::size_t bit_pos, + edge_type edit_node, + bool bit, + edge_type leaf); + void reorder( + const std::vector<edge_type> &old_to_new, + const std::vector<edge_type> &new_to_old); + }; +} + +/** Construct a binary trie and its root node. + * + * \param leaf + * Both edges of the root node will initially point to this leaf. + * The caller should provide a value that means nothing was found. */ +binary_trie::binary_trie(edge_type leaf) +{ + const node node = {{ leaf, leaf }}; + nodes.push_back(node); +} + +/** Edit the trie so it maps a range of bit sequences to the same + * leaf. + * + * \param range_min + * The first bit sequence in the range. Eight bits are packed in each + * byte. The most significant bit of the whole sequence is in the + * most significant bit of the first byte. + * + * \param range_max + * The last bit sequence in the range. + * + * \param bit_count + * The number of bits in both sequences. + * + * \param leaf + * The leaf to which all the bit sequences in the range should be + * mapped. */ +void +binary_trie::set_range( + const uint8_t range_min[], + const uint8_t range_max[], + std::size_t bit_count, + edge_type leaf) +{ + bits_vector min_bits(bit_count); + bits_vector max_bits(bit_count); + for (std::size_t i = 0; i < bit_count; ++i) { + std::size_t byte_pos = i / 8; + uint8_t mask = 1 << ((~i) % 8); + min_bits[i] = ((range_min[byte_pos] & mask) != 0); + max_bits[i] = ((range_max[byte_pos] & mask) != 0); + } + set_range_in_node(&min_bits, &max_bits, 0, 0, leaf); +} + +/** Edit a node in the trie so it maps a range of bit sequences to the + * same leaf. + * + * \param min_bits + * The first bit sequence in the range, or NULL if unbounded. + * + * \param max_bits + * The last bit sequence in the range, or NULL if unbounded. + * + * \param bit_pos + * Which bit in the sequences corresponds to \a edit_node. + * + * \param edit_node + * The node to be modified. + * + * \param leaf + * The leaf to which all the bit sequences in the range should be + * mapped. */ +void +binary_trie::set_range_in_node( + const bits_vector *min_bits, + const bits_vector *max_bits, + std::size_t bit_pos, + edge_type edit_node, + edge_type leaf) +{ + if (!min_bits || (*min_bits)[bit_pos] == false) { + set_range_in_edge(min_bits, + (max_bits && (*max_bits)[bit_pos] == false) + ? max_bits : NULL, + bit_pos + 1, edit_node, false, leaf); + } + if (!max_bits || (*max_bits)[bit_pos] == true) { + set_range_in_edge((min_bits && (*min_bits)[bit_pos] == true) + ? min_bits : NULL, + max_bits, + bit_pos + 1, edit_node, true, leaf); + } +} + +/** Edit an edge in the trie so it maps a range of bit sequences to + * the same leaf. + * + * \param min_bits + * The first bit sequence in the range, or NULL if unbounded. + * + * \param max_bits + * The last bit sequence in the range, or NULL if unbounded. + * + * \param bit_pos + * Which bit in the sequences corresponds to \a bit. + * + * \param edit_node + * The node in which the edge to be modified is located. + * + * \param bit + * Which edge of \a edit_node should be modified. + * + * \param leaf + * The leaf to which all the bit sequences in the range should be + * mapped. */ +void +binary_trie::set_range_in_edge( + const bits_vector *min_bits, + const bits_vector *max_bits, + std::size_t bit_pos, + edge_type edit_node, + bool bit, + edge_type leaf) +{ + // Check if the range fills this edge entirely. + bool entire = true; + if (min_bits + && std::find(min_bits->begin() + bit_pos, min_bits->end(), + true) != min_bits->end()) + entire = false; + if (max_bits + && std::find(max_bits->begin() + bit_pos, max_bits->end(), + false) != max_bits->end()) + entire = false; + + if (entire) { + nodes[edit_node].edges[bit] = leaf; + } else { + edge_type next = nodes[edit_node].edges[bit]; + if (next >= nodes.size()) { + const node new_node = {{ next, next }}; + next = nodes.size(); + nodes.push_back(new_node); + nodes[edit_node].edges[bit] = next; + } + + set_range_in_node(min_bits, max_bits, bit_pos, + next, leaf); + } +} + +/** Renumber the nodes in depth-first order. */ +void +binary_trie::reorder_depth_first() +{ + std::vector<edge_type> old_to_new, new_to_old; + std::stack<edge_type> depth_first; + old_to_new.resize(nodes.size(), -1); + new_to_old.reserve(nodes.size()); + depth_first.push(0); + while (!depth_first.empty()) { + const edge_type edge = depth_first.top(); + depth_first.pop(); + if (edge < nodes.size()) { + old_to_new[edge] = new_to_old.size(); + new_to_old.push_back(edge); + depth_first.push(nodes[edge].edges[1]); + depth_first.push(nodes[edge].edges[0]); + } + } + reorder(old_to_new, new_to_old); +} + +/** Renumber the nodes to make lookups use CPU and disk caches more + * effectively. + * + * First group the nodes into blocks so that each block contains the + * root of a subtrie and as many levels of its descendants as will + * fit. This way, after the root is paged in, the next few lookup + * steps need not page in anything else. Then, sort the nodes of each + * block in depth-first order. That should give each lookup almost + * 1/2 chance to find the next node immediately adjacent. + * + * With a block size of 1024 bytes, this renumbering reduces the time + * required for random lookups by about 1.1%, compared to a plain + * depth-first order. However, it's still 2.3% slower than the + * database optimized by MaxMind. */ +void +binary_trie::reorder_in_blocks( + std::size_t bytes_per_block) +{ + const edge_type none = -1; + std::vector<edge_type> old_to_new, new_to_old; + ssize_t bytes_left = bytes_per_block; + old_to_new.resize(nodes.size(), none); + new_to_old.reserve(nodes.size()); + for (edge_type subtrie = 0; subtrie < nodes.size(); ++subtrie) { + // If subtrie has already been added to the output, + // ignore it. + if (old_to_new[subtrie] != none) + continue; + + // Walk breadth-first from subtrie until we have a + // block full of nodes or the subtrie runs out. Don't + // add these nodes immediately to the output, however. + // Instead just list them in nodes_in_block. + std::set<edge_type> nodes_in_block; + std::queue<edge_type> breadth_first; + breadth_first.push(subtrie); + if (bytes_left <= 0) + bytes_left += bytes_per_block; + while (bytes_left > 0 && !breadth_first.empty()) { + edge_type edge = breadth_first.front(); + breadth_first.pop(); + if (edge >= nodes.size()) + continue; + + // Let the last node of the block straddle the + // block boundary. That's better than making + // the hotter first node do so. + bytes_left -= 6; + nodes_in_block.insert(edge); + + breadth_first.push(nodes[edge].edges[0]); + breadth_first.push(nodes[edge].edges[1]); + } + + // Add the nodes from nodes_in_block to the output in + // depth-first order. This assumes they are all + // reachable from subtrie. + std::stack<edge_type> depth_first; + depth_first.push(subtrie); + while (!depth_first.empty()) { + edge_type edge = depth_first.top(); + depth_first.pop(); + if (nodes_in_block.find(edge) + == nodes_in_block.end()) + continue; + + old_to_new[edge] = new_to_old.size(); + new_to_old.push_back(edge); + + depth_first.push(nodes[edge].edges[1]); + depth_first.push(nodes[edge].edges[0]); + } + } + reorder(old_to_new, new_to_old); +} + +void +binary_trie::reorder( + const std::vector<edge_type> &old_to_new, + const std::vector<edge_type> &new_to_old) +{ + std::vector<node> new_nodes; + new_nodes.reserve(new_to_old.size()); + for (std::vector<edge_type>::const_iterator + it = new_to_old.begin(); + it != new_to_old.end(); ++it) { + node new_node; + for (int bit = 0; bit <= 1; ++bit) { + edge_type old_edge = nodes[*it].edges[bit]; + if (old_edge < nodes.size()) + new_node.edges[bit] = old_to_new[old_edge]; + else + new_node.edges[bit] = old_edge; + } + new_nodes.push_back(new_node); + } + swap(new_nodes, nodes); +} + +/** Add the size of the trie (number of nodes) to data records*/ +void +binary_trie::update_records() +{ + for (std::vector<node>::iterator it = nodes.begin(); + it != nodes.end(); ++it) { + // previously, we commandeered the MSB in order to indicate which records + // were data records, rather than pointers to other nodes in the trie. + // Here, we remove that bit, and increment the record by the number of nodes, + // because this is how libGeoIP determines whether a node points to an entry + // inside the data section or another node. + for (int i = 0;i<2;++i) { + if (it->edges[i] == 0x80000000) + it->edges[i] = nodes.size(); + else if (it->edges[i] & 0x80000000) + { + it->edges[i] = (it->edges[i] & 0x7FFFFFFF) + nodes.size(); + } + } + } +} + +/** Write the 3 byte segment offset. **/ +void +binary_trie::write_segment(std::ostream &dat_stream) const +{ + int len = nodes.size(); + dat_stream << (char) (0xFF & len); + dat_stream << (char) (0xFF & (len >> 8)); + dat_stream << (char) (0xFF & (len >> 16)); +} + +/** Write the trie to a stream in GeoIP binary format. */ +void +binary_trie::write_binary(std::ostream &dat_stream) const +{ + for (std::vector<node>::const_iterator it = nodes.begin(); + it != nodes.end(); ++it) { + union { + uint8_t bytes[6]; + char chars[6]; + } binary = {{ + (it->edges[0] ) & 0xFF, + (it->edges[0] >> 8) & 0xFF, + (it->edges[0] >> 16) & 0xFF, + (it->edges[1] ) & 0xFF, + (it->edges[1] >> 8) & 0xFF, + (it->edges[1] >> 16) & 0xFF + }}; + dat_stream.write(binary.chars, 6); + if (dat_stream.bad()) + return; + } +} + + +namespace { + void + v4_csv_line_to_vector( + const std::string line, + std::vector<std::string> &fields) + { + std::string buf(line); + std::string delim = ","; + std::size_t fs; + for(int i = 0; i<2;++i) { + fs = buf.find(delim); + fields.push_back(buf.substr(0,fs)); + buf.erase(0,fs + delim.length()); + } + if (buf[0] == '"' || buf[0] == '\'') + fields.push_back(buf.substr(1, buf.length() - 2)); + else + fields.push_back(buf); + } + + void + v6_csv_line_to_vector( + const std::string line, + std::vector<std::string> & fields) + { + std::string buf(line); + std::string delim = ","; + std::size_t fs; + for(int i = 0; i<3;++i) { + fs = buf.rfind(delim); + fields.push_back(buf.substr(fs+delim.length(), buf.length())); + buf.erase(fs,buf.length()); + } + if (buf[0] == '"' || buf[0] == '\'') + fields.push_back(buf.substr(1, buf.length() - 2)); + else + fields.push_back(buf.substr(0, buf.length())); + } + + /** Load ranges of IP addresses from a CSV-formatted stream to + * a trie. + * + * \param trie + * Load the ranges to this trie, overwriting original values. + * + * \param csv_file_name + * The name of the file that \a csv_stream is reading. + * This string is used only for error messages. + * + * \param csv_stream + * Load the ranges from this stream. + * + * \param address_family + * The type of IP addresses in the CSV data: either AF_INET * for IPv4 or AF_INET6 for IPv6. */ + void + csv_stream_to_trie_db( + binary_trie &trie, + std::string &database_segment, + const char *csv_file_name, + std::istream &csv_stream, + int address_family) + { + enum { + V4_CSV_FIELD_MIN_DECIMAL, + V4_CSV_FIELD_MAX_DECIMAL, + V4_CSV_FIELD_ASNUM_DESCRIPTION, + V4_CSV_FIELDS + }; + + enum { + V6_CSV_FIELD_NET_BITS, + V6_CSV_FIELD_MAX_TEXT, + V6_CSV_FIELD_MIN_TEXT, + V6_CSV_FIELD_ASNUM_DESCRIPTION, + V6_CSV_FIELDS + }; + + std::string csv_line; + std::map<std::string,int> segment_offset; + std::vector<std::string> csv_fields; + // create a map to track which as descriptions are added already + int csv_line_number = 0; + database_segment += '\0'; // padding so that record 0 is not at the start of the db. + while (getline(csv_stream, csv_line)) { + ++csv_line_number; + std::string as; + switch (address_family) { + case AF_INET: + v4_csv_line_to_vector(csv_line, csv_fields); + if (csv_fields.size() != V4_CSV_FIELDS) { + error_at_line(EX_DATAERR, 0, csv_file_name, csv_line_number, + "Wrong number of fields"); + } + as = csv_fields[V4_CSV_FIELD_ASNUM_DESCRIPTION]; + if (segment_offset.find(as) == segment_offset.end()) { // no key found + segment_offset[as] = database_segment.length(); // start of record + database_segment += csv_fields[V4_CSV_FIELD_ASNUM_DESCRIPTION] + '\x00'; + } + break; + case AF_INET6: + v6_csv_line_to_vector(csv_line, csv_fields); + if (csv_fields.size() != V6_CSV_FIELDS) { + error_at_line(EX_DATAERR, 0, csv_file_name, csv_line_number, + "Wrong number of fields"); + } + as = csv_fields[V6_CSV_FIELD_ASNUM_DESCRIPTION]; + if (segment_offset.find(as) == segment_offset.end()) { // no key found + segment_offset[as] = database_segment.length(); // start of record + database_segment += csv_fields[V6_CSV_FIELD_ASNUM_DESCRIPTION] + '\x00'; + } + break; + default: + abort(); + } + + // use the MSB to indicate that this is a data record. Later, the field + // is set to the segment offset + the nubmer of nodes in the trie. + const binary_trie::edge_type leaf = 0x80000000 | segment_offset[as]; + union { + struct in_addr inet; + uint8_t inetbytes[4]; + struct in6_addr inet6; + } minaddr, maxaddr; + + switch (address_family) { + case AF_INET: + inet_aton(csv_fields[V4_CSV_FIELD_MIN_DECIMAL].c_str(), &(minaddr.inet)); + inet_aton(csv_fields[V4_CSV_FIELD_MAX_DECIMAL].c_str(), &(maxaddr.inet)); + trie.set_range(minaddr.inetbytes, maxaddr.inetbytes, 32, leaf); + break; + + case AF_INET6: + if (inet_pton(address_family, csv_fields[V6_CSV_FIELD_MIN_TEXT].c_str(), &minaddr) <= 0) { + error_at_line(EX_DATAERR, 0, csv_file_name, csv_line_number, + "Cannot parse minimum address: %s", + csv_fields[V6_CSV_FIELD_MIN_TEXT].c_str()); + } + if (inet_pton(address_family, csv_fields[V6_CSV_FIELD_MAX_TEXT].c_str(), &maxaddr) <= 0) { + error_at_line(EX_DATAERR, 0, csv_file_name, csv_line_number, + "Cannot parse maximum address: %s", + csv_fields[V6_CSV_FIELD_MAX_TEXT].c_str()); + } + trie.set_range(minaddr.inet6.s6_addr, maxaddr.inet6.s6_addr, + 128, leaf); + break; + + default: + abort(); + } + csv_fields.clear(); + } + if (csv_stream.bad()) { + error(EX_IOERR, errno, "%s", csv_file_name); + } + } + + /** Load ranges of IP addresses from a CSV-formatted file or + * standard input to a trie. + * + * \param trie + * Load the ranges to this trie, overwriting original values. + * + * \param csv_file_name + * The name of the CSV file that should be read, or "-" for + * standard input. + * + * \param address_family + * The type of IP addresses in the CSV data: either AF_INET + * for IPv4 or AF_INET6 for IPv6. */ + void + csv_file_to_trie_db( + binary_trie &trie, + std::string &database_segment, + const char *csv_file_name, + int address_family) + { + if (std::strcmp(csv_file_name, "-") == 0) { + csv_stream_to_trie_db(trie, database_segment, csv_file_name, std::cin, address_family); + } else { + std::ifstream csv_stream(csv_file_name, std::ios::in); + if (!csv_stream) { + error(EX_NOINPUT, errno, "%s", csv_file_name); + } + csv_stream_to_trie_db(trie, database_segment, csv_file_name, csv_stream, address_family); + } + } + + /** Write a GeoIP binary database to a stream. + * + * \param trie + * Mapping from IP addresses to country codes or other values. + * + * \param dat_file_name + * The name of the file that \a dat_stream is writing. + * This string is used only for error messages. + * + * \param dat_stream + * Write the database to this stream. + * + * \param database_info + * Copyright or other information about the database, or NULL. + * GeoIP_database_info() will return this string. + * + * \param address_family + * The type of IP addresses in the database: either AF_INET + * for IPv4 or AF_INET6 for IPv6. */ + void + write_dat_stream( + const binary_trie &trie, + const char *dat_file_name, + std::ostream &dat_stream, + const char *database_info, + std::string database_segment, + int address_family) + { + + trie.write_binary(dat_stream); + if (dat_stream.bad()) { + error(EX_IOERR, errno, "%s", dat_file_name); + } + + // or open the file and read the length + if (database_segment.length() > 0) { + dat_stream << database_segment; + if (dat_stream.bad()) { + error(EX_IOERR, errno, "%s", dat_file_name); + } + + } + + // write the metadata section + if (database_info) { + const char tag[3] = { 0, 0, 0 }; + dat_stream.write(tag, 3); + dat_stream.write(database_info, std::strlen(database_info)); + if (dat_stream.bad()) { + error(EX_IOERR, errno, "%s", dat_file_name); + } + } + + switch (address_family) { + case AF_INET: { + const unsigned char structure_info[4] = { 0xFF, 0xFF, 0xFF, 9 }; + dat_stream.write((const char *)structure_info, 4); + break; + } + case AF_INET6: { + const unsigned char structure_info[4] = { 0xFF, 0xFF, 0xFF, 21 }; + dat_stream.write((const char *)structure_info, 4); + break; + } + default: + abort(); + } + + trie.write_segment(dat_stream); + if (dat_stream.bad()) { + error(EX_IOERR, errno, "%s", dat_file_name); + } + } + + /** Write a GeoIP binary database to a file or standard output. + * + * \param trie + * Mapping from IP addresses to country codes or other values. + * + * \param csv_file_name + * The name of the file that should be written, or "-" for + * standard output. + * + * \param database_info + * Copyright or other information about the database, or NULL. + * GeoIP_database_info() will return this string. + * + * \param address_family + * The type of IP addresses in the database: either AF_INET + * for IPv4 or AF_INET6 for IPv6. */ + void + write_dat_file( + const binary_trie &trie, + const char *dat_file_name, + const char *database_info, + std::string database_segment, + int address_family) + { + if (std::strcmp(dat_file_name, "-") == 0) { + write_dat_stream(trie, dat_file_name, std::cout, + database_info, database_segment, address_family); + } else { + std::ofstream dat_stream( + dat_file_name, + std::ios::out | std::ios::binary); + if (!dat_stream) { + error(EX_CANTCREAT, errno, "%s", dat_file_name); + } + write_dat_stream(trie, dat_file_name, dat_stream, + database_info, database_segment, address_family); + } + } + + struct cmdline { + const char *csv_file_name; + const char *dat_file_name; + int address_family; + const char *database_info; + bool verbose; + + cmdline(int argc, char **argv); + }; +} + +cmdline::cmdline(int argc, char **argv): + csv_file_name("-"), + dat_file_name("-"), + address_family(AF_INET), + database_info(NULL), + verbose(false) +{ + enum { + OPT_HELP = -2 + }; + + static const struct option long_options[] = { + { "info", required_argument, NULL, 'i' }, + { "output", required_argument, NULL, 'o' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, OPT_HELP }, + { NULL, 0, NULL, 0 } + }; + static const char *const usage = "\ +Usage: %s [OPTION] [CSV-FILE]...\n\ +Convert a country database from CSV to GeoIP binary format.\n\ +\n\ + -i, --info=TEXT add copyright or other info TEXT to output\n\ + -o, --output=FILE write the binary data to FILE, not stdout\n\ + -v, --verbose show what is going on\n\ + --help display this help and exit\n"; + + for (;;) { + int optret = getopt_long(argc, argv, "46i:o:v", long_options, NULL); + + if (optret == -1) + break; + switch (optret) { + case '4': + address_family = AF_INET; + break; + case '6': + address_family = AF_INET6; + break; + case 'i': + database_info = optarg; + if (std::strlen(database_info) > 99) { + error(EX_USAGE, 0, + "Database info must not be longer than 99 bytes"); + } + break; + case 'o': + dat_file_name = optarg; + break; + case 'v': + verbose = true; + break; + case OPT_HELP: + std::printf(usage, program_invocation_name); + std::exit(EX_OK); + case '?': + std::fprintf(stderr, + "Try `%s --help' for more information.\n", + program_invocation_name); + std::exit(EX_USAGE); + default: + std::abort(); + } + } + + if (optind < argc) + csv_file_name = argv[optind++]; + + if (optind < argc) { + error(EX_USAGE, 0, + "Only one non-option argument is allowed"); + } +} + +int +main(int argc, char **argv) +{ + cmdline cmdline(argc, argv); + + std::ostream *verbose_stream; + if (!cmdline.verbose) + verbose_stream = NULL; + else if (strcmp(cmdline.dat_file_name, "-") == 0) + verbose_stream = &std::cerr; + else + verbose_stream = &std::cout; + + if (verbose_stream) { + *verbose_stream << program_invocation_name + << ": Reading CSV and building the trie" + << std::endl; + } + /* Initialize the trie with a value that will point to the start of the + * data section, e.g. an empty record. See binary_trie::update_records() + */ + binary_trie trie(0x80000000); + std::string database_segment; + csv_file_to_trie_db(trie, database_segment, cmdline.csv_file_name, cmdline.address_family); + + if (verbose_stream) { + *verbose_stream << program_invocation_name + << ": Optimizing" << std::endl; + } + trie.reorder_depth_first(); + trie.reorder_in_blocks(1024); + trie.update_records(); + + if (verbose_stream) { + *verbose_stream << program_invocation_name + << ": Writing output" << std::endl; + } + write_dat_file(trie, cmdline.dat_file_name, cmdline.database_info, + database_segment, cmdline.address_family); + + if (verbose_stream) { + *verbose_stream << program_invocation_name + << ": All done" << std::endl; } +} |