/* geoip-csv-to-dat - convert a country database from CSV to GeoIP binary format * * Copyright (c) 2009 Kalle Olavi Niemitalo. * Copyright (c) 2011 Patrick Matthäi * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #define _GNU_SOURCE 1 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // Format of GeoIP Country database files // ====================================== // // 1. Binary trie mapping IP addresses to countries. // 2. Optional unused data. // 3. Optional database-info block. // 4. Optional structure-info block. // // Binary trie // ----------- // // The trie treats IP addresses as bit sequences and maps them to // numbers. In the country database, each such number is 0xFFFF00 + // the the country ID that GeoIP_id_by_ipnum() would return. The // meanings of country IDs are hardcoded in libGeoIP and cannot be // overridden by the database. // // The root node of the trie is at the beginning of the file, and the // other nodes then follow it. Each node has the same size and // consists of two little-endian pointers that correspond to the two // possible values of a bit. In the country database, the pointers are // 24-bit, and each node is thus 6 bytes long. // // Each pointer is one of: // - The number that the whole lookup should return, i.e. 0xFFFF00 + id // in the country database. // - The number of the node that should be examined next, counting from // 0 at the beginning of the file. Pointing back to nodes with // smaller numbers is allowed, but loops are not allowed. // // Optional unused data // -------------------- // // The file format seems to permit extra data between the binary trie // and the optional blocks. // // Optional database-info block // ---------------------------- // // Near the end of the file, there may be a three-byte tag (0x00 0x00 // 0x00) followed by at most DATABASE_INFO_MAX_SIZE - 1 = 99 bytes of // text that describes the database. GeoIP_database_info() returns // this text and appends a terminating '\0'. // // The GeoLite Country IPv4 database downloadable from MaxMind // includes this database-info block. // // Optional structure-info block // ----------------------------- // // At the very end of the file, there may be a three-byte tag (0xFF // 0xFF 0xFF) followed by at most STRUCTURE_INFO_MAX_SIZE - 1 = 19 // bytes. The first byte is the database type, // e.g. GEOIP_COUNTRY_EDITION = 1 or GEOIP_COUNTRY_EDITION_V6 = 12, // possibly with 105 added to it. Type-specific information then // follows. There is no type-specific information for the country // editions. // // The GeoLite Country IPv4 database downloadable from MaxMind does // not include this structure-info block. namespace { class binary_trie { public: typedef uint_fast32_t edge_type; explicit binary_trie(edge_type leaf); void set_range( const uint8_t range_min[], const uint8_t range_max[], std::size_t bit_count, edge_type leaf); void reorder_depth_first(); void reorder_in_blocks(std::size_t bytes_per_block); void write_binary(std::ostream &dat_stream) const; void write_segment(std::ostream &dat_stream) const; void update_records(); private: struct node { edge_type edges[2]; }; std::vector nodes; // This could be std::vector but that seems slower. typedef std::vector bits_vector; void set_range_in_node( const bits_vector *min_bits, const bits_vector *max_bits, std::size_t bit_pos, edge_type edit_node, edge_type leaf); void set_range_in_edge( const bits_vector *min_bits, const bits_vector *max_bits, std::size_t bit_pos, edge_type edit_node, bool bit, edge_type leaf); void reorder( const std::vector &old_to_new, const std::vector &new_to_old); }; } /** Construct a binary trie and its root node. * * \param leaf * Both edges of the root node will initially point to this leaf. * The caller should provide a value that means nothing was found. */ binary_trie::binary_trie(edge_type leaf) { const node node = {{ leaf, leaf }}; nodes.push_back(node); } /** Edit the trie so it maps a range of bit sequences to the same * leaf. * * \param range_min * The first bit sequence in the range. Eight bits are packed in each * byte. The most significant bit of the whole sequence is in the * most significant bit of the first byte. * * \param range_max * The last bit sequence in the range. * * \param bit_count * The number of bits in both sequences. * * \param leaf * The leaf to which all the bit sequences in the range should be * mapped. */ void binary_trie::set_range( const uint8_t range_min[], const uint8_t range_max[], std::size_t bit_count, edge_type leaf) { bits_vector min_bits(bit_count); bits_vector max_bits(bit_count); for (std::size_t i = 0; i < bit_count; ++i) { std::size_t byte_pos = i / 8; uint8_t mask = 1 << ((~i) % 8); min_bits[i] = ((range_min[byte_pos] & mask) != 0); max_bits[i] = ((range_max[byte_pos] & mask) != 0); } set_range_in_node(&min_bits, &max_bits, 0, 0, leaf); } /** Edit a node in the trie so it maps a range of bit sequences to the * same leaf. * * \param min_bits * The first bit sequence in the range, or NULL if unbounded. * * \param max_bits * The last bit sequence in the range, or NULL if unbounded. * * \param bit_pos * Which bit in the sequences corresponds to \a edit_node. * * \param edit_node * The node to be modified. * * \param leaf * The leaf to which all the bit sequences in the range should be * mapped. */ void binary_trie::set_range_in_node( const bits_vector *min_bits, const bits_vector *max_bits, std::size_t bit_pos, edge_type edit_node, edge_type leaf) { if (!min_bits || (*min_bits)[bit_pos] == false) { set_range_in_edge(min_bits, (max_bits && (*max_bits)[bit_pos] == false) ? max_bits : NULL, bit_pos + 1, edit_node, false, leaf); } if (!max_bits || (*max_bits)[bit_pos] == true) { set_range_in_edge((min_bits && (*min_bits)[bit_pos] == true) ? min_bits : NULL, max_bits, bit_pos + 1, edit_node, true, leaf); } } /** Edit an edge in the trie so it maps a range of bit sequences to * the same leaf. * * \param min_bits * The first bit sequence in the range, or NULL if unbounded. * * \param max_bits * The last bit sequence in the range, or NULL if unbounded. * * \param bit_pos * Which bit in the sequences corresponds to \a bit. * * \param edit_node * The node in which the edge to be modified is located. * * \param bit * Which edge of \a edit_node should be modified. * * \param leaf * The leaf to which all the bit sequences in the range should be * mapped. */ void binary_trie::set_range_in_edge( const bits_vector *min_bits, const bits_vector *max_bits, std::size_t bit_pos, edge_type edit_node, bool bit, edge_type leaf) { // Check if the range fills this edge entirely. bool entire = true; if (min_bits && std::find(min_bits->begin() + bit_pos, min_bits->end(), true) != min_bits->end()) entire = false; if (max_bits && std::find(max_bits->begin() + bit_pos, max_bits->end(), false) != max_bits->end()) entire = false; if (entire) { nodes[edit_node].edges[bit] = leaf; } else { edge_type next = nodes[edit_node].edges[bit]; if (next >= nodes.size()) { const node new_node = {{ next, next }}; next = nodes.size(); nodes.push_back(new_node); nodes[edit_node].edges[bit] = next; } set_range_in_node(min_bits, max_bits, bit_pos, next, leaf); } } /** Renumber the nodes in depth-first order. */ void binary_trie::reorder_depth_first() { std::vector old_to_new, new_to_old; std::stack depth_first; old_to_new.resize(nodes.size(), -1); new_to_old.reserve(nodes.size()); depth_first.push(0); while (!depth_first.empty()) { const edge_type edge = depth_first.top(); depth_first.pop(); if (edge < nodes.size()) { old_to_new[edge] = new_to_old.size(); new_to_old.push_back(edge); depth_first.push(nodes[edge].edges[1]); depth_first.push(nodes[edge].edges[0]); } } reorder(old_to_new, new_to_old); } /** Renumber the nodes to make lookups use CPU and disk caches more * effectively. * * First group the nodes into blocks so that each block contains the * root of a subtrie and as many levels of its descendants as will * fit. This way, after the root is paged in, the next few lookup * steps need not page in anything else. Then, sort the nodes of each * block in depth-first order. That should give each lookup almost * 1/2 chance to find the next node immediately adjacent. * * With a block size of 1024 bytes, this renumbering reduces the time * required for random lookups by about 1.1%, compared to a plain * depth-first order. However, it's still 2.3% slower than the * database optimized by MaxMind. */ void binary_trie::reorder_in_blocks( std::size_t bytes_per_block) { const edge_type none = -1; std::vector old_to_new, new_to_old; ssize_t bytes_left = bytes_per_block; old_to_new.resize(nodes.size(), none); new_to_old.reserve(nodes.size()); for (edge_type subtrie = 0; subtrie < nodes.size(); ++subtrie) { // If subtrie has already been added to the output, // ignore it. if (old_to_new[subtrie] != none) continue; // Walk breadth-first from subtrie until we have a // block full of nodes or the subtrie runs out. Don't // add these nodes immediately to the output, however. // Instead just list them in nodes_in_block. std::set nodes_in_block; std::queue breadth_first; breadth_first.push(subtrie); if (bytes_left <= 0) bytes_left += bytes_per_block; while (bytes_left > 0 && !breadth_first.empty()) { edge_type edge = breadth_first.front(); breadth_first.pop(); if (edge >= nodes.size()) continue; // Let the last node of the block straddle the // block boundary. That's better than making // the hotter first node do so. bytes_left -= 6; nodes_in_block.insert(edge); breadth_first.push(nodes[edge].edges[0]); breadth_first.push(nodes[edge].edges[1]); } // Add the nodes from nodes_in_block to the output in // depth-first order. This assumes they are all // reachable from subtrie. std::stack depth_first; depth_first.push(subtrie); while (!depth_first.empty()) { edge_type edge = depth_first.top(); depth_first.pop(); if (nodes_in_block.find(edge) == nodes_in_block.end()) continue; old_to_new[edge] = new_to_old.size(); new_to_old.push_back(edge); depth_first.push(nodes[edge].edges[1]); depth_first.push(nodes[edge].edges[0]); } } reorder(old_to_new, new_to_old); } void binary_trie::reorder( const std::vector &old_to_new, const std::vector &new_to_old) { std::vector new_nodes; new_nodes.reserve(new_to_old.size()); for (std::vector::const_iterator it = new_to_old.begin(); it != new_to_old.end(); ++it) { node new_node; for (int bit = 0; bit <= 1; ++bit) { edge_type old_edge = nodes[*it].edges[bit]; if (old_edge < nodes.size()) new_node.edges[bit] = old_to_new[old_edge]; else new_node.edges[bit] = old_edge; } new_nodes.push_back(new_node); } swap(new_nodes, nodes); } /** Add the size of the trie (number of nodes) to data records*/ void binary_trie::update_records() { for (std::vector::iterator it = nodes.begin(); it != nodes.end(); ++it) { // previously, we commandeered the MSB in order to indicate which records // were data records, rather than pointers to other nodes in the trie. // Here, we remove that bit, and increment the record by the number of nodes, // because this is how libGeoIP determines whether a node points to an entry // inside the data section or another node. for (int i = 0;i<2;++i) { if (it->edges[i] == 0x80000000) it->edges[i] = nodes.size(); else if (it->edges[i] & 0x80000000) { it->edges[i] = (it->edges[i] & 0x7FFFFFFF) + nodes.size(); } } } } /** Write the 3 byte segment offset. **/ void binary_trie::write_segment(std::ostream &dat_stream) const { int len = nodes.size(); dat_stream << (char) (0xFF & len); dat_stream << (char) (0xFF & (len >> 8)); dat_stream << (char) (0xFF & (len >> 16)); } /** Write the trie to a stream in GeoIP binary format. */ void binary_trie::write_binary(std::ostream &dat_stream) const { for (std::vector::const_iterator it = nodes.begin(); it != nodes.end(); ++it) { union { uint8_t bytes[6]; char chars[6]; } binary = {{ (it->edges[0] ) & 0xFF, (it->edges[0] >> 8) & 0xFF, (it->edges[0] >> 16) & 0xFF, (it->edges[1] ) & 0xFF, (it->edges[1] >> 8) & 0xFF, (it->edges[1] >> 16) & 0xFF }}; dat_stream.write(binary.chars, 6); if (dat_stream.bad()) return; } } namespace { void v4_csv_line_to_vector( const std::string line, std::vector &fields) { std::string buf(line); std::string delim = ","; std::size_t fs; for(int i = 0; i<2;++i) { fs = buf.find(delim); fields.push_back(buf.substr(0,fs)); buf.erase(0,fs + delim.length()); } if (buf[0] == '"' || buf[0] == '\'') fields.push_back(buf.substr(1, buf.length() - 2)); else fields.push_back(buf); } void v6_csv_line_to_vector( const std::string line, std::vector & fields) { std::string buf(line); std::string delim = ","; std::size_t fs; for(int i = 0; i<3;++i) { fs = buf.rfind(delim); fields.push_back(buf.substr(fs+delim.length(), buf.length())); buf.erase(fs,buf.length()); } if (buf[0] == '"' || buf[0] == '\'') fields.push_back(buf.substr(1, buf.length() - 2)); else fields.push_back(buf.substr(0, buf.length())); } /** Load ranges of IP addresses from a CSV-formatted stream to * a trie. * * \param trie * Load the ranges to this trie, overwriting original values. * * \param csv_file_name * The name of the file that \a csv_stream is reading. * This string is used only for error messages. * * \param csv_stream * Load the ranges from this stream. * * \param address_family * The type of IP addresses in the CSV data: either AF_INET * for IPv4 or AF_INET6 for IPv6. */ void csv_stream_to_trie_db( binary_trie &trie, std::string &database_segment, const char *csv_file_name, std::istream &csv_stream, int address_family) { enum { V4_CSV_FIELD_MIN_DECIMAL, V4_CSV_FIELD_MAX_DECIMAL, V4_CSV_FIELD_ASNUM_DESCRIPTION, V4_CSV_FIELDS }; enum { V6_CSV_FIELD_NET_BITS, V6_CSV_FIELD_MAX_TEXT, V6_CSV_FIELD_MIN_TEXT, V6_CSV_FIELD_ASNUM_DESCRIPTION, V6_CSV_FIELDS }; std::string csv_line; std::map segment_offset; std::vector csv_fields; // create a map to track which as descriptions are added already int csv_line_number = 0; database_segment += '\0'; // padding so that record 0 is not at the start of the db. while (getline(csv_stream, csv_line)) { ++csv_line_number; std::string as; switch (address_family) { case AF_INET: v4_csv_line_to_vector(csv_line, csv_fields); if (csv_fields.size() != V4_CSV_FIELDS) { error_at_line(EX_DATAERR, 0, csv_file_name, csv_line_number, "Wrong number of fields"); } as = csv_fields[V4_CSV_FIELD_ASNUM_DESCRIPTION]; if (segment_offset.find(as) == segment_offset.end()) { // no key found segment_offset[as] = database_segment.length(); // start of record database_segment += csv_fields[V4_CSV_FIELD_ASNUM_DESCRIPTION] + '\x00'; } break; case AF_INET6: v6_csv_line_to_vector(csv_line, csv_fields); if (csv_fields.size() != V6_CSV_FIELDS) { error_at_line(EX_DATAERR, 0, csv_file_name, csv_line_number, "Wrong number of fields"); } as = csv_fields[V6_CSV_FIELD_ASNUM_DESCRIPTION]; if (segment_offset.find(as) == segment_offset.end()) { // no key found segment_offset[as] = database_segment.length(); // start of record database_segment += csv_fields[V6_CSV_FIELD_ASNUM_DESCRIPTION] + '\x00'; } break; default: abort(); } // use the MSB to indicate that this is a data record. Later, the field // is set to the segment offset + the nubmer of nodes in the trie. const binary_trie::edge_type leaf = 0x80000000 | segment_offset[as]; union { struct in_addr inet; uint8_t inetbytes[4]; struct in6_addr inet6; } minaddr, maxaddr; switch (address_family) { case AF_INET: inet_aton(csv_fields[V4_CSV_FIELD_MIN_DECIMAL].c_str(), &(minaddr.inet)); inet_aton(csv_fields[V4_CSV_FIELD_MAX_DECIMAL].c_str(), &(maxaddr.inet)); trie.set_range(minaddr.inetbytes, maxaddr.inetbytes, 32, leaf); break; case AF_INET6: if (inet_pton(address_family, csv_fields[V6_CSV_FIELD_MIN_TEXT].c_str(), &minaddr) <= 0) { error_at_line(EX_DATAERR, 0, csv_file_name, csv_line_number, "Cannot parse minimum address: %s", csv_fields[V6_CSV_FIELD_MIN_TEXT].c_str()); } if (inet_pton(address_family, csv_fields[V6_CSV_FIELD_MAX_TEXT].c_str(), &maxaddr) <= 0) { error_at_line(EX_DATAERR, 0, csv_file_name, csv_line_number, "Cannot parse maximum address: %s", csv_fields[V6_CSV_FIELD_MAX_TEXT].c_str()); } trie.set_range(minaddr.inet6.s6_addr, maxaddr.inet6.s6_addr, 128, leaf); break; default: abort(); } csv_fields.clear(); } if (csv_stream.bad()) { error(EX_IOERR, errno, "%s", csv_file_name); } } /** Load ranges of IP addresses from a CSV-formatted file or * standard input to a trie. * * \param trie * Load the ranges to this trie, overwriting original values. * * \param csv_file_name * The name of the CSV file that should be read, or "-" for * standard input. * * \param address_family * The type of IP addresses in the CSV data: either AF_INET * for IPv4 or AF_INET6 for IPv6. */ void csv_file_to_trie_db( binary_trie &trie, std::string &database_segment, const char *csv_file_name, int address_family) { if (std::strcmp(csv_file_name, "-") == 0) { csv_stream_to_trie_db(trie, database_segment, csv_file_name, std::cin, address_family); } else { std::ifstream csv_stream(csv_file_name, std::ios::in); if (!csv_stream) { error(EX_NOINPUT, errno, "%s", csv_file_name); } csv_stream_to_trie_db(trie, database_segment, csv_file_name, csv_stream, address_family); } } /** Write a GeoIP binary database to a stream. * * \param trie * Mapping from IP addresses to country codes or other values. * * \param dat_file_name * The name of the file that \a dat_stream is writing. * This string is used only for error messages. * * \param dat_stream * Write the database to this stream. * * \param database_info * Copyright or other information about the database, or NULL. * GeoIP_database_info() will return this string. * * \param address_family * The type of IP addresses in the database: either AF_INET * for IPv4 or AF_INET6 for IPv6. */ void write_dat_stream( const binary_trie &trie, const char *dat_file_name, std::ostream &dat_stream, const char *database_info, std::string database_segment, int address_family) { trie.write_binary(dat_stream); if (dat_stream.bad()) { error(EX_IOERR, errno, "%s", dat_file_name); } // or open the file and read the length if (database_segment.length() > 0) { dat_stream << database_segment; if (dat_stream.bad()) { error(EX_IOERR, errno, "%s", dat_file_name); } } // write the metadata section if (database_info) { const char tag[3] = { 0, 0, 0 }; dat_stream.write(tag, 3); dat_stream.write(database_info, std::strlen(database_info)); if (dat_stream.bad()) { error(EX_IOERR, errno, "%s", dat_file_name); } } switch (address_family) { case AF_INET: { const unsigned char structure_info[4] = { 0xFF, 0xFF, 0xFF, 9 }; dat_stream.write((const char *)structure_info, 4); break; } case AF_INET6: { const unsigned char structure_info[4] = { 0xFF, 0xFF, 0xFF, 21 }; dat_stream.write((const char *)structure_info, 4); break; } default: abort(); } trie.write_segment(dat_stream); if (dat_stream.bad()) { error(EX_IOERR, errno, "%s", dat_file_name); } } /** Write a GeoIP binary database to a file or standard output. * * \param trie * Mapping from IP addresses to country codes or other values. * * \param csv_file_name * The name of the file that should be written, or "-" for * standard output. * * \param database_info * Copyright or other information about the database, or NULL. * GeoIP_database_info() will return this string. * * \param address_family * The type of IP addresses in the database: either AF_INET * for IPv4 or AF_INET6 for IPv6. */ void write_dat_file( const binary_trie &trie, const char *dat_file_name, const char *database_info, std::string database_segment, int address_family) { if (std::strcmp(dat_file_name, "-") == 0) { write_dat_stream(trie, dat_file_name, std::cout, database_info, database_segment, address_family); } else { std::ofstream dat_stream( dat_file_name, std::ios::out | std::ios::binary); if (!dat_stream) { error(EX_CANTCREAT, errno, "%s", dat_file_name); } write_dat_stream(trie, dat_file_name, dat_stream, database_info, database_segment, address_family); } } struct cmdline { const char *csv_file_name; const char *dat_file_name; int address_family; const char *database_info; bool verbose; cmdline(int argc, char **argv); }; } cmdline::cmdline(int argc, char **argv): csv_file_name("-"), dat_file_name("-"), address_family(AF_INET), database_info(NULL), verbose(false) { enum { OPT_HELP = -2 }; static const struct option long_options[] = { { "info", required_argument, NULL, 'i' }, { "output", required_argument, NULL, 'o' }, { "verbose", no_argument, NULL, 'v' }, { "help", no_argument, NULL, OPT_HELP }, { NULL, 0, NULL, 0 } }; static const char *const usage = "\ Usage: %s [OPTION] [CSV-FILE]...\n\ Convert a country database from CSV to GeoIP binary format.\n\ \n\ -i, --info=TEXT add copyright or other info TEXT to output\n\ -o, --output=FILE write the binary data to FILE, not stdout\n\ -v, --verbose show what is going on\n\ --help display this help and exit\n"; for (;;) { int optret = getopt_long(argc, argv, "46i:o:v", long_options, NULL); if (optret == -1) break; switch (optret) { case '4': address_family = AF_INET; break; case '6': address_family = AF_INET6; break; case 'i': database_info = optarg; if (std::strlen(database_info) > 99) { error(EX_USAGE, 0, "Database info must not be longer than 99 bytes"); } break; case 'o': dat_file_name = optarg; break; case 'v': verbose = true; break; case OPT_HELP: std::printf(usage, program_invocation_name); std::exit(EX_OK); case '?': std::fprintf(stderr, "Try `%s --help' for more information.\n", program_invocation_name); std::exit(EX_USAGE); default: std::abort(); } } if (optind < argc) csv_file_name = argv[optind++]; if (optind < argc) { error(EX_USAGE, 0, "Only one non-option argument is allowed"); } } int main(int argc, char **argv) { cmdline cmdline(argc, argv); std::ostream *verbose_stream; if (!cmdline.verbose) verbose_stream = NULL; else if (strcmp(cmdline.dat_file_name, "-") == 0) verbose_stream = &std::cerr; else verbose_stream = &std::cout; if (verbose_stream) { *verbose_stream << program_invocation_name << ": Reading CSV and building the trie" << std::endl; } /* Initialize the trie with a value that will point to the start of the * data section, e.g. an empty record. See binary_trie::update_records() */ binary_trie trie(0x80000000); std::string database_segment; csv_file_to_trie_db(trie, database_segment, cmdline.csv_file_name, cmdline.address_family); if (verbose_stream) { *verbose_stream << program_invocation_name << ": Optimizing" << std::endl; } trie.reorder_depth_first(); trie.reorder_in_blocks(1024); trie.update_records(); if (verbose_stream) { *verbose_stream << program_invocation_name << ": Writing output" << std::endl; } write_dat_file(trie, cmdline.dat_file_name, cmdline.database_info, database_segment, cmdline.address_family); if (verbose_stream) { *verbose_stream << program_invocation_name << ": All done" << std::endl; } }