// Copyright 2013 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Author: dsites@google.com (Dick Sites) // #include "tote.h" #include "lang_script.h" // For LanguageCode in Dump #include #include // For memset namespace CLD2 { // Take a set of pairs and tote them up. // After explicitly sorting, retrieve top key, value pairs // Normal use is key=per-script language and value = probability score Tote::Tote() { in_use_mask_ = 0; byte_count_ = 0; score_count_ = 0; // No need to initialize values } Tote::~Tote() { } void Tote::Reinit() { in_use_mask_ = 0; byte_count_ = 0; score_count_ = 0; // No need to initialize values } // Increment count of quadgrams/trigrams/unigrams scored void Tote::AddScoreCount() { ++score_count_; } void Tote::Add(uint8 ikey, int idelta) { int key_group = ikey >> 2; uint64 groupmask = (1ULL << key_group); if ((in_use_mask_ & groupmask) == 0) { // Initialize this group gscore_[key_group] = 0; in_use_mask_ |= groupmask; } score_[ikey] += idelta; } // Return current top three keys void Tote::CurrentTopThreeKeys(int* key3) const { key3[0] = -1; key3[1] = -1; key3[2] = -1; int score3[3] = {-1, -1, -1}; uint64 tempmask = in_use_mask_; int base = 0; while (tempmask != 0) { if (tempmask & 1) { // Look at four in-use keys for (int i = 0; i < 4; ++i) { int insert_me = score_[base + i]; // Favor lower numbers on ties if (insert_me > score3[2]) { // Insert int insert_at = 2; if (insert_me > score3[1]) { score3[2] = score3[1]; key3[2] = key3[1]; insert_at = 1; if (insert_me > score3[0]) { score3[1] = score3[0]; key3[1] = key3[0]; insert_at = 0; } } score3[insert_at] = insert_me; key3[insert_at] = base + i; } } } tempmask >>= 1; base += 4; } } // Take a set of pairs and tote them up. // After explicitly sorting, retrieve top key, value pairs // 0xFFFF in key signifies unused DocTote::DocTote() { // No need to initialize score_ or value_ incr_count_ = 0; sorted_ = 0; memset(closepair_, 0, sizeof(closepair_)); memset(key_, 0xFF, sizeof(key_)); } DocTote::~DocTote() { } void DocTote::Reinit() { // No need to initialize score_ or value_ incr_count_ = 0; sorted_ = 0; memset(closepair_, 0, sizeof(closepair_)); memset(key_, 0xFF, sizeof(key_)); runningscore_.Reinit(); } // Weight reliability by ibytes // Also see three-way associative comments above for Tote void DocTote::Add(uint16 ikey, int ibytes, int score, int ireliability) { ++incr_count_; // Look for existing entry in top 2 positions of 3, times 8 columns int sub0 = ikey & 15; if (key_[sub0] == ikey) { value_[sub0] += ibytes; score_[sub0] += score; reliability_[sub0] += ireliability * ibytes; return; } // Look for existing entry in other of top 2 positions of 3, times 8 columns int sub1 = sub0 ^ 8; if (key_[sub1] == ikey) { value_[sub1] += ibytes; score_[sub1] += score; reliability_[sub1] += ireliability * ibytes; return; } // Look for existing entry in third position of 3, times 8 columns int sub2 = (ikey & 7) + 16; if (key_[sub2] == ikey) { value_[sub2] += ibytes; score_[sub2] += score; reliability_[sub2] += ireliability * ibytes; return; } // Allocate new entry int alloc = -1; if (key_[sub0] == kUnusedKey) { alloc = sub0; } else if (key_[sub1] == kUnusedKey) { alloc = sub1; } else if (key_[sub2] == kUnusedKey) { alloc = sub2; } else { // All choices allocated, need to replace smallest one alloc = sub0; if (value_[sub1] < value_[alloc]) {alloc = sub1;} if (value_[sub2] < value_[alloc]) {alloc = sub2;} } key_[alloc] = ikey; value_[alloc] = ibytes; score_[alloc] = score; reliability_[alloc] = ireliability * ibytes; return; } // Find subscript of a given packed language, or -1 int DocTote::Find(uint16 ikey) { if (sorted_) { // Linear search if sorted for (int sub = 0; sub < kMaxSize_; ++sub) { if (key_[sub] == ikey) {return sub;} } return -1; } // Look for existing entry int sub0 = ikey & 15; if (key_[sub0] == ikey) { return sub0; } int sub1 = sub0 ^ 8; if (key_[sub1] == ikey) { return sub1; } int sub2 = (ikey & 7) + 16; if (key_[sub2] == ikey) { return sub2; } return -1; } // Return current top key int DocTote::CurrentTopKey() { int top_key = 0; int top_value = -1; for (int sub = 0; sub < kMaxSize_; ++sub) { if (key_[sub] == kUnusedKey) {continue;} if (top_value < value_[sub]) { top_value = value_[sub]; top_key = key_[sub]; } } return top_key; } // Sort first n entries by decreasing order of value // If key==0 other fields are not valid, treat value as -1 void DocTote::Sort(int n) { // This is n**2, but n is small for (int sub = 0; sub < n; ++sub) { if (key_[sub] == kUnusedKey) {value_[sub] = -1;} // Bubble sort key[sub] and entry[sub] for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) { if (key_[sub2] == kUnusedKey) {value_[sub2] = -1;} if (value_[sub] < value_[sub2]) { // swap uint16 tmpk = key_[sub]; key_[sub] = key_[sub2]; key_[sub2] = tmpk; int tmpv = value_[sub]; value_[sub] = value_[sub2]; value_[sub2] = tmpv; double tmps = score_[sub]; score_[sub] = score_[sub2]; score_[sub2] = tmps; int tmpr = reliability_[sub]; reliability_[sub] = reliability_[sub2]; reliability_[sub2] = tmpr; } } } sorted_ = 1; } void DocTote::Dump(FILE* f) { fprintf(f, "DocTote::Dump\n"); for (int sub = 0; sub < kMaxSize_; ++sub) { if (key_[sub] != kUnusedKey) { Language lang = static_cast(key_[sub]); fprintf(f, "[%2d] %3s %6dB %5dp %4dR,\n", sub, LanguageCode(lang), value_[sub], score_[sub], reliability_[sub]); } } fprintf(f, " %d chunks scored
\n", incr_count_); } } // End namespace CLD2