diff options
Diffstat (limited to '')
-rw-r--r-- | toolkit/components/translation/cld2/internal/tote.cc | 265 |
1 files changed, 265 insertions, 0 deletions
diff --git a/toolkit/components/translation/cld2/internal/tote.cc b/toolkit/components/translation/cld2/internal/tote.cc new file mode 100644 index 0000000000..fbaba7d5cb --- /dev/null +++ b/toolkit/components/translation/cld2/internal/tote.cc @@ -0,0 +1,265 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Author: dsites@google.com (Dick Sites) +// + +#include "tote.h" +#include "lang_script.h" // For LanguageCode in Dump + +#include <stdio.h> +#include <string.h> // For memset + +namespace CLD2 { + +// Take a set of <key, value> pairs and tote them up. +// After explicitly sorting, retrieve top key, value pairs +// Normal use is key=per-script language and value = probability score +Tote::Tote() { + in_use_mask_ = 0; + byte_count_ = 0; + score_count_ = 0; + // No need to initialize values +} + +Tote::~Tote() { +} + +void Tote::Reinit() { + in_use_mask_ = 0; + byte_count_ = 0; + score_count_ = 0; + // No need to initialize values +} +// Increment count of quadgrams/trigrams/unigrams scored +void Tote::AddScoreCount() { + ++score_count_; +} + + +void Tote::Add(uint8 ikey, int idelta) { + int key_group = ikey >> 2; + uint64 groupmask = (1ULL << key_group); + if ((in_use_mask_ & groupmask) == 0) { + // Initialize this group + gscore_[key_group] = 0; + in_use_mask_ |= groupmask; + } + score_[ikey] += idelta; +} + + +// Return current top three keys +void Tote::CurrentTopThreeKeys(int* key3) const { + key3[0] = -1; + key3[1] = -1; + key3[2] = -1; + int score3[3] = {-1, -1, -1}; + uint64 tempmask = in_use_mask_; + int base = 0; + while (tempmask != 0) { + if (tempmask & 1) { + // Look at four in-use keys + for (int i = 0; i < 4; ++i) { + int insert_me = score_[base + i]; + // Favor lower numbers on ties + if (insert_me > score3[2]) { + // Insert + int insert_at = 2; + if (insert_me > score3[1]) { + score3[2] = score3[1]; + key3[2] = key3[1]; + insert_at = 1; + if (insert_me > score3[0]) { + score3[1] = score3[0]; + key3[1] = key3[0]; + insert_at = 0; + } + } + score3[insert_at] = insert_me; + key3[insert_at] = base + i; + } + } + } + tempmask >>= 1; + base += 4; + } +} + + +// Take a set of <key, value> pairs and tote them up. +// After explicitly sorting, retrieve top key, value pairs +// 0xFFFF in key signifies unused +DocTote::DocTote() { + // No need to initialize score_ or value_ + incr_count_ = 0; + sorted_ = 0; + memset(closepair_, 0, sizeof(closepair_)); + memset(key_, 0xFF, sizeof(key_)); +} + +DocTote::~DocTote() { +} + +void DocTote::Reinit() { + // No need to initialize score_ or value_ + incr_count_ = 0; + sorted_ = 0; + memset(closepair_, 0, sizeof(closepair_)); + memset(key_, 0xFF, sizeof(key_)); + runningscore_.Reinit(); +} + +// Weight reliability by ibytes +// Also see three-way associative comments above for Tote +void DocTote::Add(uint16 ikey, int ibytes, + int score, int ireliability) { + ++incr_count_; + + // Look for existing entry in top 2 positions of 3, times 8 columns + int sub0 = ikey & 15; + if (key_[sub0] == ikey) { + value_[sub0] += ibytes; + score_[sub0] += score; + reliability_[sub0] += ireliability * ibytes; + return; + } + // Look for existing entry in other of top 2 positions of 3, times 8 columns + int sub1 = sub0 ^ 8; + if (key_[sub1] == ikey) { + value_[sub1] += ibytes; + score_[sub1] += score; + reliability_[sub1] += ireliability * ibytes; + return; + } + // Look for existing entry in third position of 3, times 8 columns + int sub2 = (ikey & 7) + 16; + if (key_[sub2] == ikey) { + value_[sub2] += ibytes; + score_[sub2] += score; + reliability_[sub2] += ireliability * ibytes; + return; + } + + // Allocate new entry + int alloc = -1; + if (key_[sub0] == kUnusedKey) { + alloc = sub0; + } else if (key_[sub1] == kUnusedKey) { + alloc = sub1; + } else if (key_[sub2] == kUnusedKey) { + alloc = sub2; + } else { + // All choices allocated, need to replace smallest one + alloc = sub0; + if (value_[sub1] < value_[alloc]) {alloc = sub1;} + if (value_[sub2] < value_[alloc]) {alloc = sub2;} + } + key_[alloc] = ikey; + value_[alloc] = ibytes; + score_[alloc] = score; + reliability_[alloc] = ireliability * ibytes; + return; +} + +// Find subscript of a given packed language, or -1 +int DocTote::Find(uint16 ikey) { + if (sorted_) { + // Linear search if sorted + for (int sub = 0; sub < kMaxSize_; ++sub) { + if (key_[sub] == ikey) {return sub;} + } + return -1; + } + + // Look for existing entry + int sub0 = ikey & 15; + if (key_[sub0] == ikey) { + return sub0; + } + int sub1 = sub0 ^ 8; + if (key_[sub1] == ikey) { + return sub1; + } + int sub2 = (ikey & 7) + 16; + if (key_[sub2] == ikey) { + return sub2; + } + + return -1; +} + +// Return current top key +int DocTote::CurrentTopKey() { + int top_key = 0; + int top_value = -1; + for (int sub = 0; sub < kMaxSize_; ++sub) { + if (key_[sub] == kUnusedKey) {continue;} + if (top_value < value_[sub]) { + top_value = value_[sub]; + top_key = key_[sub]; + } + } + return top_key; +} + + +// Sort first n entries by decreasing order of value +// If key==0 other fields are not valid, treat value as -1 +void DocTote::Sort(int n) { + // This is n**2, but n is small + for (int sub = 0; sub < n; ++sub) { + if (key_[sub] == kUnusedKey) {value_[sub] = -1;} + + // Bubble sort key[sub] and entry[sub] + for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) { + if (key_[sub2] == kUnusedKey) {value_[sub2] = -1;} + if (value_[sub] < value_[sub2]) { + // swap + uint16 tmpk = key_[sub]; + key_[sub] = key_[sub2]; + key_[sub2] = tmpk; + + int tmpv = value_[sub]; + value_[sub] = value_[sub2]; + value_[sub2] = tmpv; + + double tmps = score_[sub]; + score_[sub] = score_[sub2]; + score_[sub2] = tmps; + + int tmpr = reliability_[sub]; + reliability_[sub] = reliability_[sub2]; + reliability_[sub2] = tmpr; + } + } + } + sorted_ = 1; +} + +void DocTote::Dump(FILE* f) { + fprintf(f, "DocTote::Dump\n"); + for (int sub = 0; sub < kMaxSize_; ++sub) { + if (key_[sub] != kUnusedKey) { + Language lang = static_cast<Language>(key_[sub]); + fprintf(f, "[%2d] %3s %6dB %5dp %4dR,\n", sub, LanguageCode(lang), + value_[sub], score_[sub], reliability_[sub]); + } + } + fprintf(f, " %d chunks scored<br>\n", incr_count_); +} + +} // End namespace CLD2 + |