760 lines
28 KiB
C++
760 lines
28 KiB
C++
// Copyright 2010-2025 Google LLC
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "ortools/set_cover/set_cover_model.h"
|
|
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
#include <cstddef>
|
|
#include <cstdint>
|
|
#include <limits>
|
|
#include <numeric>
|
|
#include <string>
|
|
#include <tuple>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "absl/log/check.h"
|
|
#include "absl/log/log.h"
|
|
#include "absl/numeric/bits.h"
|
|
#include "absl/random/discrete_distribution.h"
|
|
#include "absl/random/distributions.h"
|
|
#include "absl/random/random.h"
|
|
#include "absl/strings/str_format.h"
|
|
#include "absl/strings/str_join.h"
|
|
#include "absl/strings/string_view.h"
|
|
#include "absl/types/span.h"
|
|
#include "ortools/algorithms/radix_sort.h"
|
|
#include "ortools/set_cover/base_types.h"
|
|
#include "ortools/set_cover/set_cover.pb.h"
|
|
|
|
namespace operations_research {
|
|
|
|
namespace {
|
|
|
|
// Returns a value in [min, min + scaling_factor * (raw_value - min +
|
|
// random_term)], where raw_value is drawn from a discrete distribution, and
|
|
// random_term is a double drawn uniformly in [0, 1].
|
|
BaseInt DiscreteAffine(absl::BitGen& bitgen,
|
|
absl::discrete_distribution<BaseInt>& dist, BaseInt min,
|
|
double scaling_factor) {
|
|
const BaseInt raw_value = dist(bitgen);
|
|
const double random_term = absl::Uniform<double>(bitgen, 0, 1.0);
|
|
const BaseInt affine_value =
|
|
static_cast<BaseInt>(
|
|
std::floor((raw_value - min + random_term) * scaling_factor)) +
|
|
min;
|
|
return affine_value;
|
|
}
|
|
|
|
// For a given view (SparseColumnView or SparseRowView), returns the
|
|
// distribution of the sizes of the vectors in the view, which can be used in
|
|
// an absl::discrete_distribution.
|
|
template <typename View>
|
|
std::tuple<BaseInt, std::vector<BaseInt>> ComputeSizeHistogram(
|
|
const View& view) {
|
|
BaseInt max_size = 0;
|
|
BaseInt min_size = std::numeric_limits<BaseInt>::max();
|
|
for (const auto& vec : view) {
|
|
const BaseInt size = vec.size();
|
|
min_size = std::min(min_size, size);
|
|
max_size = std::max(max_size, size);
|
|
}
|
|
std::vector<BaseInt> weights(max_size + 1, 0);
|
|
for (const auto& vec : view) {
|
|
const BaseInt size = vec.size();
|
|
++weights[size];
|
|
}
|
|
return {min_size, weights};
|
|
}
|
|
|
|
template <typename View>
|
|
std::tuple<BaseInt, absl::discrete_distribution<BaseInt>>
|
|
ComputeSizeDistribution(const View& view) {
|
|
const auto [min_size, weights] = ComputeSizeHistogram(view);
|
|
absl::discrete_distribution<BaseInt> dist(weights.begin(), weights.end());
|
|
return {min_size, dist};
|
|
}
|
|
} // namespace
|
|
|
|
SetCoverModel SetCoverModel::GenerateRandomModelFrom(
|
|
const SetCoverModel& seed_model, BaseInt num_elements, BaseInt num_subsets,
|
|
double row_scale, double column_scale, double cost_scale) {
|
|
SetCoverModel model;
|
|
StopWatch stop_watch(&(model.generation_duration_));
|
|
DCHECK_GT(row_scale, 0.0);
|
|
DCHECK_GT(column_scale, 0.0);
|
|
DCHECK_GT(cost_scale, 0.0);
|
|
model.num_elements_ = num_elements;
|
|
model.num_nonzeros_ = 0;
|
|
model.ResizeNumSubsets(num_subsets);
|
|
model.UpdateAllSubsetsList();
|
|
absl::BitGen bitgen;
|
|
|
|
// Create the distribution of the cardinalities of the subsets based on the
|
|
// histogram of column sizes in the seed model.
|
|
auto [min_column_size, column_dist] =
|
|
ComputeSizeDistribution(seed_model.columns());
|
|
|
|
// Create the distribution of the degrees of the elements based on the
|
|
// histogram of row sizes in the seed model.
|
|
auto [min_row_size, row_dist] = ComputeSizeDistribution(seed_model.rows());
|
|
|
|
// Prepare the degrees of the elements in the generated model, and use them
|
|
// in a distribution to generate the columns. This ponderates the columns
|
|
// towards the elements with higher degrees. ???
|
|
ElementToIntVector degrees(num_elements);
|
|
for (ElementIndex element(0); element.value() < num_elements; ++element) {
|
|
degrees[element] =
|
|
DiscreteAffine(bitgen, row_dist, min_row_size, row_scale);
|
|
}
|
|
absl::discrete_distribution<BaseInt> degree_dist(degrees.begin(),
|
|
degrees.end());
|
|
|
|
// Vector indicating whether the generated model covers an element.
|
|
ElementBoolVector contains_element(num_elements, false);
|
|
|
|
// Number of elements in the generated model, using the above vector.
|
|
BaseInt num_elements_covered(0);
|
|
|
|
// Maximum number of tries to generate a random element that is not yet in
|
|
// the subset, or a random subset that does not contain the element.
|
|
const int kMaxTries = 10;
|
|
|
|
// Loop-local vector indicating whether the currently generated subset
|
|
// contains an element.
|
|
ElementBoolVector subset_already_contains_element(num_elements, false);
|
|
for (SubsetIndex subset(0); subset.value() < num_subsets; ++subset) {
|
|
LOG_EVERY_N_SEC(INFO, 5)
|
|
<< absl::StrFormat("Generating subset %d (%.1f%%)", subset.value(),
|
|
100.0 * subset.value() / num_subsets);
|
|
const BaseInt cardinality =
|
|
DiscreteAffine(bitgen, column_dist, min_column_size, column_scale);
|
|
model.columns_[subset].reserve(cardinality);
|
|
for (BaseInt iter = 0; iter < cardinality; ++iter) {
|
|
int num_tries = 0;
|
|
ElementIndex element;
|
|
// Choose an element that is not yet in the subset at random with a
|
|
// distribution that is proportional to the degree of the element.
|
|
do {
|
|
element = ElementIndex(degree_dist(bitgen));
|
|
CHECK_LT(element.value(), num_elements);
|
|
++num_tries;
|
|
} while (num_tries < kMaxTries &&
|
|
subset_already_contains_element[element]);
|
|
++model.num_nonzeros_;
|
|
model.columns_[subset].push_back(element);
|
|
subset_already_contains_element[element] = true;
|
|
if (!contains_element[element]) {
|
|
contains_element[element] = true;
|
|
++num_elements_covered;
|
|
}
|
|
}
|
|
for (const ElementIndex element : model.columns_[subset]) {
|
|
subset_already_contains_element[element] = false;
|
|
}
|
|
}
|
|
VLOG(1) << "Finished generating the model with " << num_elements_covered
|
|
<< " elements covered.";
|
|
|
|
// It can happen -- rarely in practice -- that some of the elements cannot be
|
|
// covered. Let's add them to randomly chosen subsets.
|
|
if (num_elements_covered != num_elements) {
|
|
VLOG(1) << "Generated model with " << num_elements - num_elements_covered
|
|
<< " elements that cannot be covered. Adding them to random "
|
|
"subsets.";
|
|
SubsetBoolVector element_already_in_subset(num_subsets, false);
|
|
for (ElementIndex element(0); element.value() < num_elements; ++element) {
|
|
VLOG_EVERY_N_SEC(1, 5) << absl::StrFormat(
|
|
"Generating subsets for element %d (%.1f%%)", element.value(),
|
|
100.0 * element.value() / num_elements);
|
|
if (!contains_element[element]) {
|
|
const BaseInt degree =
|
|
DiscreteAffine(bitgen, row_dist, min_row_size, row_scale);
|
|
std::vector<SubsetIndex> generated_subsets;
|
|
generated_subsets.reserve(degree);
|
|
for (BaseInt i = 0; i < degree; ++i) {
|
|
int num_tries = 0;
|
|
SubsetIndex subset_index;
|
|
// The method is the same as above.
|
|
do {
|
|
subset_index = SubsetIndex(DiscreteAffine(
|
|
bitgen, column_dist, min_column_size, column_scale));
|
|
++num_tries;
|
|
} while (num_tries < kMaxTries &&
|
|
element_already_in_subset[subset_index]);
|
|
++model.num_nonzeros_;
|
|
model.columns_[subset_index].push_back(element);
|
|
element_already_in_subset[subset_index] = true;
|
|
generated_subsets.push_back(subset_index);
|
|
}
|
|
for (const SubsetIndex subset_index : generated_subsets) {
|
|
element_already_in_subset[subset_index] = false;
|
|
}
|
|
contains_element[element] = true;
|
|
++num_elements_covered;
|
|
}
|
|
}
|
|
VLOG(1) << "Finished generating subsets for elements that were not "
|
|
"covered in the original model.";
|
|
}
|
|
VLOG(1) << "Finished generating the model. There are "
|
|
<< num_elements - num_elements_covered << " uncovered elements.";
|
|
|
|
CHECK_EQ(num_elements_covered, num_elements);
|
|
|
|
// TODO(user): if necessary, use a better distribution for the costs.
|
|
// The generation of the costs is done in two steps. First, compute the
|
|
// minimum and maximum costs.
|
|
Cost min_cost = std::numeric_limits<Cost>::infinity();
|
|
Cost max_cost = -min_cost;
|
|
for (const Cost cost : seed_model.subset_costs()) {
|
|
min_cost = std::min(min_cost, cost);
|
|
max_cost = std::max(max_cost, cost);
|
|
}
|
|
// Then, generate random numbers in [min_cost, min_cost + cost_range], where
|
|
// cost_range is defined as:
|
|
const Cost cost_range = cost_scale * (max_cost - min_cost);
|
|
for (Cost& cost : model.subset_costs_) {
|
|
cost = min_cost + absl::Uniform<double>(bitgen, 0, cost_range);
|
|
}
|
|
model.CreateSparseRowView();
|
|
return model;
|
|
}
|
|
|
|
void SetCoverModel::UpdateAllSubsetsList() {
|
|
const BaseInt old_size = all_subsets_.size();
|
|
DCHECK_LE(old_size, num_subsets());
|
|
all_subsets_.resize(num_subsets());
|
|
for (BaseInt subset(old_size); subset < num_subsets(); ++subset) {
|
|
all_subsets_[subset] = SubsetIndex(subset);
|
|
}
|
|
}
|
|
|
|
void SetCoverModel::AddEmptySubset(Cost cost) {
|
|
// TODO(user): refine the logic for is_unicost_ and is_unicost_valid_.
|
|
is_unicost_valid_ = false;
|
|
elements_in_subsets_are_sorted_ = false;
|
|
subset_costs_.push_back(cost);
|
|
columns_.push_back(SparseColumn());
|
|
all_subsets_.push_back(SubsetIndex(num_subsets_));
|
|
++num_subsets_;
|
|
CHECK_EQ(columns_.size(), num_subsets());
|
|
CHECK_EQ(subset_costs_.size(), num_subsets());
|
|
CHECK_EQ(all_subsets_.size(), num_subsets());
|
|
row_view_is_valid_ = false;
|
|
}
|
|
|
|
void SetCoverModel::AddElementToLastSubset(BaseInt element) {
|
|
elements_in_subsets_are_sorted_ = false;
|
|
columns_.back().push_back(ElementIndex(element));
|
|
num_elements_ = std::max(num_elements_, element + 1);
|
|
// No need to update the list all_subsets_.
|
|
++num_nonzeros_;
|
|
row_view_is_valid_ = false;
|
|
}
|
|
|
|
void SetCoverModel::AddElementToLastSubset(ElementIndex element) {
|
|
AddElementToLastSubset(element.value());
|
|
}
|
|
|
|
void SetCoverModel::SetSubsetCost(BaseInt subset, Cost cost) {
|
|
// TODO(user): refine the logic for is_unicost_ and is_unicost_valid_.
|
|
// NOMUTANTS -- this is a performance optimization.
|
|
is_unicost_valid_ = false;
|
|
elements_in_subsets_are_sorted_ = false;
|
|
CHECK(std::isfinite(cost));
|
|
DCHECK_GE(subset, 0);
|
|
if (subset >= num_subsets()) {
|
|
num_subsets_ = std::max(num_subsets_, subset + 1);
|
|
columns_.resize(num_subsets_, SparseColumn());
|
|
subset_costs_.resize(num_subsets_, 0.0);
|
|
row_view_is_valid_ = false;
|
|
UpdateAllSubsetsList();
|
|
}
|
|
subset_costs_[SubsetIndex(subset)] = cost;
|
|
}
|
|
|
|
void SetCoverModel::SetSubsetCost(SubsetIndex subset, Cost cost) {
|
|
SetSubsetCost(subset.value(), cost);
|
|
}
|
|
|
|
void SetCoverModel::AddElementToSubset(BaseInt element, BaseInt subset) {
|
|
elements_in_subsets_are_sorted_ = false;
|
|
if (subset >= num_subsets()) {
|
|
num_subsets_ = subset + 1;
|
|
subset_costs_.resize(num_subsets_, 0.0);
|
|
columns_.resize(num_subsets_, SparseColumn());
|
|
UpdateAllSubsetsList();
|
|
}
|
|
columns_[SubsetIndex(subset)].push_back(ElementIndex(element));
|
|
num_elements_ = std::max(num_elements_, element + 1);
|
|
++num_nonzeros_;
|
|
row_view_is_valid_ = false;
|
|
}
|
|
|
|
void SetCoverModel::AddElementToSubset(ElementIndex element,
|
|
SubsetIndex subset) {
|
|
AddElementToSubset(element.value(), subset.value());
|
|
}
|
|
|
|
void SetCoverModel::ResizeNumSubsets(BaseInt num_subsets) {
|
|
num_subsets_ = std::max(num_subsets_, num_subsets);
|
|
columns_.resize(num_subsets_, SparseColumn());
|
|
subset_costs_.resize(num_subsets_, 0.0);
|
|
UpdateAllSubsetsList();
|
|
}
|
|
|
|
void SetCoverModel::ResizeNumSubsets(SubsetIndex num_subsets) {
|
|
ResizeNumSubsets(num_subsets.value());
|
|
}
|
|
|
|
void SetCoverModel::ReserveNumElementsInSubset(BaseInt num_elements,
|
|
BaseInt subset) {
|
|
ResizeNumSubsets(subset);
|
|
columns_[SubsetIndex(subset)].reserve(ColumnEntryIndex(num_elements));
|
|
}
|
|
|
|
void SetCoverModel::SortElementsInSubsets() {
|
|
for (const SubsetIndex subset : SubsetRange()) {
|
|
// std::sort(columns_[subset].begin(), columns_[subset].end());
|
|
BaseInt* data = reinterpret_cast<BaseInt*>(columns_[subset].data());
|
|
RadixSort(absl::MakeSpan(data, columns_[subset].size()));
|
|
}
|
|
elements_in_subsets_are_sorted_ = true;
|
|
}
|
|
|
|
void SetCoverModel::CreateSparseRowView() {
|
|
StopWatch stop_watch(&create_sparse_row_view_duration_);
|
|
if (row_view_is_valid_) {
|
|
VLOG(1) << "CreateSparseRowView: already valid";
|
|
return;
|
|
}
|
|
VLOG(1) << "CreateSparseRowView started";
|
|
rows_.resize(num_elements_, SparseRow());
|
|
ElementToIntVector row_sizes(num_elements_, 0);
|
|
for (const SubsetIndex subset : SubsetRange()) {
|
|
BaseInt* data = reinterpret_cast<BaseInt*>(columns_[subset].data());
|
|
RadixSort(absl::MakeSpan(data, columns_[subset].size()));
|
|
|
|
ElementIndex preceding_element(-1);
|
|
for (const ElementIndex element : columns_[subset]) {
|
|
CHECK_GT(element, preceding_element)
|
|
<< "Repetition in column "
|
|
<< subset; // Fail if there is a repetition.
|
|
++row_sizes[element];
|
|
preceding_element = element;
|
|
}
|
|
}
|
|
for (const ElementIndex element : ElementRange()) {
|
|
rows_[element].reserve(RowEntryIndex(row_sizes[element]));
|
|
}
|
|
for (const SubsetIndex subset : SubsetRange()) {
|
|
for (const ElementIndex element : columns_[subset]) {
|
|
rows_[element].emplace_back(subset);
|
|
}
|
|
}
|
|
row_view_is_valid_ = true;
|
|
elements_in_subsets_are_sorted_ = true;
|
|
VLOG(1) << "CreateSparseRowView finished";
|
|
}
|
|
|
|
std::vector<SubsetIndex> SetCoverModel::ComputeSliceIndices(
|
|
int num_partitions) {
|
|
if (num_partitions <= 1 || columns_.empty()) {
|
|
return {SubsetIndex(columns_.size())};
|
|
}
|
|
|
|
std::vector<BaseInt> partial_sum_nnz(columns_.size());
|
|
partial_sum_nnz[0] = columns_[SubsetIndex(0)].size();
|
|
for (BaseInt col = 1; col < columns_.size(); ++col) {
|
|
partial_sum_nnz[col] =
|
|
partial_sum_nnz[col - 1] + columns_[SubsetIndex(col)].size();
|
|
}
|
|
const BaseInt total_nnz = partial_sum_nnz.back();
|
|
const BaseInt target_nnz = (total_nnz + num_partitions - 1) / num_partitions;
|
|
|
|
std::vector<SubsetIndex> partition_points(num_partitions);
|
|
BaseInt threshold = target_nnz;
|
|
BaseInt pos = 0;
|
|
for (const SubsetIndex col : SubsetRange()) {
|
|
if (partial_sum_nnz[col.value()] >= threshold) {
|
|
partition_points[pos] = col;
|
|
++pos;
|
|
threshold += target_nnz;
|
|
}
|
|
}
|
|
partition_points[num_partitions - 1] = SubsetIndex(columns_.size());
|
|
return partition_points;
|
|
}
|
|
|
|
SparseRowView SetCoverModel::ComputeSparseRowViewSlice(SubsetIndex begin_subset,
|
|
SubsetIndex end_subset) {
|
|
SparseRowView rows;
|
|
rows.reserve(num_elements_);
|
|
ElementToIntVector row_sizes(num_elements_, 0);
|
|
for (SubsetIndex subset = begin_subset; subset < end_subset; ++subset) {
|
|
BaseInt* data = reinterpret_cast<BaseInt*>(columns_[subset].data());
|
|
RadixSort(absl::MakeSpan(data, columns_[subset].size()));
|
|
|
|
ElementIndex preceding_element(-1);
|
|
for (const ElementIndex element : columns_[subset]) {
|
|
CHECK_GT(element, preceding_element)
|
|
<< "Repetition in column "
|
|
<< subset; // Fail if there is a repetition.
|
|
++row_sizes[element];
|
|
preceding_element = element;
|
|
}
|
|
}
|
|
for (const ElementIndex element : ElementRange()) {
|
|
rows[element].reserve(RowEntryIndex(row_sizes[element]));
|
|
}
|
|
for (SubsetIndex subset = begin_subset; subset < end_subset; ++subset) {
|
|
for (const ElementIndex element : columns_[subset]) {
|
|
rows[element].emplace_back(subset);
|
|
}
|
|
}
|
|
return rows;
|
|
}
|
|
|
|
std::vector<SparseRowView> SetCoverModel::CutSparseRowViewInSlices(
|
|
absl::Span<const SubsetIndex> partition_points) {
|
|
std::vector<SparseRowView> row_views;
|
|
row_views.reserve(partition_points.size());
|
|
SubsetIndex begin_subset(0);
|
|
// This should be done in parallel. It is a bottleneck.
|
|
for (SubsetIndex end_subset : partition_points) {
|
|
row_views.push_back(ComputeSparseRowViewSlice(begin_subset, end_subset));
|
|
begin_subset = end_subset;
|
|
}
|
|
return row_views;
|
|
}
|
|
|
|
SparseRowView SetCoverModel::ReduceSparseRowViewSlices(
|
|
absl::Span<const SparseRowView> slices) {
|
|
SparseRowView result_rows;
|
|
// This is not a ReduceTree. This will be done later through parallelism.
|
|
result_rows.reserve(num_elements_);
|
|
ElementToIntVector row_sizes(num_elements_, 0);
|
|
for (const SparseRowView& slice : slices) {
|
|
// This should done as a reduce tree, in parallel.
|
|
for (const ElementIndex element : ElementRange()) {
|
|
result_rows[element].insert(result_rows[element].end(),
|
|
slice[element].begin(), slice[element].end());
|
|
}
|
|
}
|
|
return result_rows;
|
|
}
|
|
|
|
SparseRowView SetCoverModel::ComputeSparseRowViewUsingSlices() {
|
|
StopWatch stop_watch(&compute_sparse_row_view_using_slices_duration_);
|
|
SparseRowView rows;
|
|
VLOG(1) << "CreateSparseRowViewUsingSlices started";
|
|
const std::vector<SubsetIndex> partition_points =
|
|
ComputeSliceIndices(num_subsets());
|
|
const std::vector<SparseRowView> slices =
|
|
CutSparseRowViewInSlices(partition_points);
|
|
rows = ReduceSparseRowViewSlices(slices);
|
|
VLOG(1) << "CreateSparseRowViewUsingSlices finished";
|
|
return rows;
|
|
}
|
|
|
|
bool SetCoverModel::ComputeFeasibility() {
|
|
StopWatch stop_watch(&feasibility_duration_);
|
|
CHECK_GT(num_elements(), 0);
|
|
CHECK_GT(num_subsets(), 0);
|
|
CHECK_EQ(columns_.size(), num_subsets());
|
|
CHECK_EQ(subset_costs_.size(), num_subsets());
|
|
CHECK_EQ(all_subsets_.size(), num_subsets());
|
|
for (const Cost cost : subset_costs_) {
|
|
CHECK_GT(cost, 0.0);
|
|
}
|
|
ElementToIntVector possible_coverage(num_elements_, 0);
|
|
SubsetIndex column_index(0);
|
|
for (const SparseColumn& column : columns_) {
|
|
DLOG_IF(INFO, column.empty()) << "Empty column " << column_index.value();
|
|
for (const ElementIndex element : column) {
|
|
++possible_coverage[element];
|
|
}
|
|
// NOMUTANTS -- column_index is only used for logging in debug mode.
|
|
++column_index;
|
|
}
|
|
int64_t num_uncoverable_elements = 0;
|
|
for (const ElementIndex element : ElementRange()) {
|
|
// NOMUTANTS -- num_uncoverable_elements is only used for logging.
|
|
if (possible_coverage[element] == 0) {
|
|
++num_uncoverable_elements;
|
|
}
|
|
}
|
|
VLOG(1) << "num_uncoverable_elements = " << num_uncoverable_elements;
|
|
for (const ElementIndex element : ElementRange()) {
|
|
if (possible_coverage[element] > 0) continue;
|
|
LOG(ERROR) << "Element " << element << " is not covered.";
|
|
return false;
|
|
}
|
|
VLOG(1) << "Max possible coverage = "
|
|
<< *std::max_element(possible_coverage.begin(),
|
|
possible_coverage.end());
|
|
for (const SubsetIndex subset : SubsetRange()) {
|
|
if (all_subsets_[subset.value()] == subset) continue;
|
|
LOG(ERROR) << "subset = " << subset << " all_subsets_[subset.value()] = "
|
|
<< all_subsets_[subset.value()];
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
SetCoverProto SetCoverModel::ExportModelAsProto() const {
|
|
CHECK(elements_in_subsets_are_sorted_);
|
|
SetCoverProto message;
|
|
for (const SubsetIndex subset : SubsetRange()) {
|
|
VLOG_EVERY_N_SEC(1, 5) << absl::StrFormat(
|
|
"Exporting subset %d (%.1f%%)", subset.value(),
|
|
100.0 * subset.value() / num_subsets());
|
|
SetCoverProto::Subset* subset_proto = message.add_subset();
|
|
subset_proto->set_cost(subset_costs_[subset]);
|
|
SparseColumn column = columns_[subset]; // Copy is intentional.
|
|
BaseInt* data = reinterpret_cast<BaseInt*>(column.data());
|
|
RadixSort(absl::MakeSpan(data, column.size()));
|
|
for (const ElementIndex element : column) {
|
|
subset_proto->add_element(element.value());
|
|
}
|
|
}
|
|
VLOG(1) << "Finished exporting the model.";
|
|
return message;
|
|
}
|
|
|
|
void SetCoverModel::ImportModelFromProto(const SetCoverProto& message) {
|
|
columns_.clear();
|
|
subset_costs_.clear();
|
|
ResizeNumSubsets(message.subset_size());
|
|
SubsetIndex subset_index(0);
|
|
for (const SetCoverProto::Subset& subset_proto : message.subset()) {
|
|
subset_costs_[subset_index] = subset_proto.cost();
|
|
if (subset_proto.element_size() > 0) {
|
|
columns_[subset_index].reserve(
|
|
ColumnEntryIndex(subset_proto.element_size()));
|
|
for (const BaseInt element : subset_proto.element()) {
|
|
columns_[subset_index].push_back(ElementIndex(element));
|
|
num_elements_ = std::max(num_elements_, element + 1);
|
|
}
|
|
++subset_index;
|
|
}
|
|
}
|
|
UpdateAllSubsetsList();
|
|
CreateSparseRowView();
|
|
}
|
|
|
|
std::string SetCoverModel::ToVerboseString(absl::string_view sep) const {
|
|
return absl::StrJoin(
|
|
std::make_tuple("num_elements", num_elements(), "num_subsets",
|
|
num_subsets(), "num_nonzeros", num_nonzeros(),
|
|
"fill_rate", FillRate()),
|
|
sep);
|
|
}
|
|
|
|
std::string SetCoverModel::ToString(absl::string_view sep) const {
|
|
return absl::StrJoin(std::make_tuple(num_elements(), num_subsets(),
|
|
num_nonzeros(), FillRate()),
|
|
sep);
|
|
}
|
|
|
|
namespace {
|
|
// Returns the standard deviation of the vector v, excluding those values that
|
|
// are zero.
|
|
template <typename T>
|
|
double StandardDeviation(const std::vector<T>& values) {
|
|
const size_t size = values.size();
|
|
double n = 0.0; // n is used in a calculation involving doubles.
|
|
double sum_of_squares = 0.0;
|
|
double sum = 0.0;
|
|
for (size_t i = 0; i < size; ++i) {
|
|
double sample = static_cast<double>(values[i]);
|
|
if (sample == 0.0) continue;
|
|
sum_of_squares += sample * sample;
|
|
sum += sample;
|
|
++n;
|
|
}
|
|
// Since we know all the values, we can compute the standard deviation
|
|
// exactly.
|
|
return n == 0.0 ? 0.0 : sqrt((sum_of_squares - sum * sum / n) / n);
|
|
}
|
|
|
|
// Statistics accumulation class used to compute statistics on the deltas of
|
|
// the row and column elements and their sizes in bytes.
|
|
// Since the values are not all stored, it's not possible to compute the median
|
|
// exactly. It is returned as 0.0. NaN would be a better choice, but it's just
|
|
// not a good idea as NaNs can propagate and cause problems.
|
|
class StatsAccumulator {
|
|
public:
|
|
StatsAccumulator()
|
|
: count_(0),
|
|
min_(kInfinity),
|
|
max_(-kInfinity),
|
|
sum_(0.0),
|
|
sum_of_squares_(0.0) {}
|
|
|
|
void Register(double value) {
|
|
++count_;
|
|
min_ = std::min(min_, value);
|
|
max_ = std::max(max_, value);
|
|
sum_ += value;
|
|
sum_of_squares_ += value * value;
|
|
}
|
|
|
|
SetCoverModel::Stats ComputeStats() const {
|
|
const int64_t n = count_;
|
|
// Since the code is used on a known number of values, we can compute the
|
|
// standard deviation exactly, even if the values are not all stored.
|
|
const double stddev =
|
|
n == 0 ? 0.0 : sqrt((sum_of_squares_ - sum_ * sum_ / n) / n);
|
|
return SetCoverModel::Stats{min_, max_, 0.0, sum_ / n, stddev};
|
|
}
|
|
|
|
private:
|
|
static constexpr double kInfinity = std::numeric_limits<double>::infinity();
|
|
int64_t count_;
|
|
double min_;
|
|
double max_;
|
|
double sum_;
|
|
double sum_of_squares_;
|
|
};
|
|
} // namespace
|
|
|
|
template <typename T>
|
|
SetCoverModel::Stats ComputeStats(std::vector<T> samples) {
|
|
SetCoverModel::Stats stats;
|
|
stats.min = *std::min_element(samples.begin(), samples.end());
|
|
stats.max = *std::max_element(samples.begin(), samples.end());
|
|
stats.mean =
|
|
std::accumulate(samples.begin(), samples.end(), 0.0) / samples.size();
|
|
auto const q1 = samples.size() / 4;
|
|
auto const q2 = samples.size() / 2;
|
|
auto const q3 = q1 + q2;
|
|
// The first call to nth_element is O(n). The 2nd and 3rd calls are O(n / 2).
|
|
// Basically it's equivalent to running nth_element twice.
|
|
// One should be tempted to use a faster sorting algorithm like radix sort,
|
|
// it is not sure that we would gain a lot. There would be no gain in
|
|
// complexity whatsoever anyway. On top of that, this code is called at most
|
|
// one per problem, so it's not worth the effort.
|
|
std::nth_element(samples.begin(), samples.begin() + q2, samples.end());
|
|
std::nth_element(samples.begin(), samples.begin() + q1, samples.begin() + q2);
|
|
std::nth_element(samples.begin() + q2, samples.begin() + q3, samples.end());
|
|
stats.median = samples[samples.size() / 2];
|
|
stats.iqr = samples[q3] - samples[q1];
|
|
stats.stddev = StandardDeviation(samples);
|
|
return stats;
|
|
}
|
|
|
|
template <typename T>
|
|
std::vector<T> ComputeDeciles(std::vector<T> values) {
|
|
const int kNumDeciles = 10;
|
|
std::vector<T> deciles(kNumDeciles, T{0});
|
|
const double step = values.size() / kNumDeciles;
|
|
for (int i = 1; i < kNumDeciles; ++i) {
|
|
const size_t point = std::clamp<double>(i * step, 0, values.size() - 1);
|
|
std::nth_element(values.begin(), values.begin() + point, values.end());
|
|
deciles[i] = values[point];
|
|
}
|
|
return deciles;
|
|
}
|
|
|
|
SetCoverModel::Stats SetCoverModel::ComputeCostStats() const {
|
|
std::vector<Cost> subset_costs(num_subsets());
|
|
std::copy(subset_costs_.begin(), subset_costs_.end(), subset_costs.begin());
|
|
return ComputeStats(std::move(subset_costs));
|
|
}
|
|
|
|
SetCoverModel::Stats SetCoverModel::ComputeRowStats() const {
|
|
std::vector<int64_t> row_sizes(num_elements(), 0);
|
|
for (const SparseColumn& column : columns_) {
|
|
for (const ElementIndex element : column) {
|
|
++row_sizes[element.value()];
|
|
}
|
|
}
|
|
return ComputeStats(std::move(row_sizes));
|
|
}
|
|
|
|
SetCoverModel::Stats SetCoverModel::ComputeColumnStats() const {
|
|
std::vector<int64_t> column_sizes(columns_.size());
|
|
for (const SubsetIndex subset : SubsetRange()) {
|
|
column_sizes[subset.value()] = columns_[subset].size();
|
|
}
|
|
return ComputeStats(std::move(column_sizes));
|
|
}
|
|
|
|
std::string SetCoverModel::Stats::ToString(absl::string_view sep) const {
|
|
return absl::StrJoin(std::make_tuple(min, max, median, mean, stddev, iqr),
|
|
sep);
|
|
}
|
|
|
|
std::string SetCoverModel::Stats::ToVerboseString(absl::string_view sep) const {
|
|
return absl::StrJoin(
|
|
std::make_tuple("min", min, "max", max, "median", median, "mean", mean,
|
|
"stddev", stddev, "iqr", iqr),
|
|
sep);
|
|
}
|
|
|
|
std::vector<int64_t> SetCoverModel::ComputeRowDeciles() const {
|
|
std::vector<int64_t> row_sizes(num_elements(), 0);
|
|
for (const SparseColumn& column : columns_) {
|
|
for (const ElementIndex element : column) {
|
|
++row_sizes[element.value()];
|
|
}
|
|
}
|
|
return ComputeDeciles(std::move(row_sizes));
|
|
}
|
|
|
|
std::vector<int64_t> SetCoverModel::ComputeColumnDeciles() const {
|
|
std::vector<int64_t> column_sizes(columns_.size());
|
|
for (const SubsetIndex subset : SubsetRange()) {
|
|
column_sizes[subset.value()] = columns_[subset].size();
|
|
}
|
|
return ComputeDeciles(std::move(column_sizes));
|
|
}
|
|
|
|
namespace {
|
|
// Returns the number of bytes needed to store x with a base-128 encoding.
|
|
BaseInt Base128SizeInBytes(BaseInt x) {
|
|
const uint64_t u = x == 0 ? 1 : static_cast<uint64_t>(x);
|
|
return (std::numeric_limits<uint64_t>::digits - absl::countl_zero(u) + 6) / 7;
|
|
}
|
|
} // namespace
|
|
|
|
SetCoverModel::Stats SetCoverModel::ComputeColumnDeltaSizeStats() const {
|
|
StatsAccumulator acc;
|
|
for (const SparseColumn& column : columns_) {
|
|
int64_t previous = 0;
|
|
for (const ElementIndex element : column) {
|
|
const int64_t delta = element.value() - previous;
|
|
previous = element.value();
|
|
acc.Register(Base128SizeInBytes(delta));
|
|
}
|
|
}
|
|
return acc.ComputeStats();
|
|
}
|
|
|
|
SetCoverModel::Stats SetCoverModel::ComputeRowDeltaSizeStats() const {
|
|
StatsAccumulator acc;
|
|
for (const SparseRow& row : rows_) {
|
|
int64_t previous = 0;
|
|
for (const SubsetIndex subset : row) {
|
|
const int64_t delta = subset.value() - previous;
|
|
previous = subset.value();
|
|
acc.Register(Base128SizeInBytes(delta));
|
|
}
|
|
}
|
|
return acc.ComputeStats();
|
|
}
|
|
} // namespace operations_research
|