diff --git a/infra/indexer/frontend/frontend.cc b/infra/indexer/frontend/frontend.cc index 96083ba7fa29..662ff589f5fe 100644 --- a/infra/indexer/frontend/frontend.cc +++ b/infra/indexer/frontend/frontend.cc @@ -118,12 +118,13 @@ clang::tooling::CommandLineArguments ExtraArgumentsAdjuster( // perform indexing on a compilation database. std::vector, clang::tooling::ArgumentsAdjuster>> -GetIndexActions(FileCopier& file_copier, MergeQueue& merge_queue) { +GetIndexActions(FileCopier& file_copier, MergeQueue& merge_queue, + bool support_incremental_indexing) { std::vector, clang::tooling::ArgumentsAdjuster>> actions; - auto index_action = - std::make_unique(file_copier, merge_queue); + auto index_action = std::make_unique( + file_copier, merge_queue, support_incremental_indexing); auto adjuster = clang::tooling::combineAdjusters(RemoveClangArgumentsAdjuster, ExtraArgumentsAdjuster); actions.push_back( diff --git a/infra/indexer/frontend/frontend.h b/infra/indexer/frontend/frontend.h index 633057306d88..82423643c4c0 100644 --- a/infra/indexer/frontend/frontend.h +++ b/infra/indexer/frontend/frontend.h @@ -39,7 +39,8 @@ std::vector ParseCommandLine(absl::string_view commandLine); // perform indexing on a compilation database. 
std::vector, clang::tooling::ArgumentsAdjuster>> -GetIndexActions(FileCopier& file_copier, MergeQueue& merge_queue); +GetIndexActions(FileCopier& file_copier, MergeQueue& merge_queue, + bool support_incremental_indexing = false); } // namespace indexer } // namespace oss_fuzz diff --git a/infra/indexer/frontend/index_action.cc b/infra/indexer/frontend/index_action.cc index acabee106530..f95acfceb2c8 100644 --- a/infra/indexer/frontend/index_action.cc +++ b/infra/indexer/frontend/index_action.cc @@ -29,6 +29,8 @@ #include "absl/strings/match.h" #include "absl/strings/string_view.h" #include "clang/AST/ASTConsumer.h" +#include "clang/Basic/FileEntry.h" +#include "clang/Basic/SourceLocation.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Lex/Pragma.h" #include "clang/Lex/Preprocessor.h" @@ -41,11 +43,28 @@ namespace oss_fuzz { namespace indexer { class AstConsumer : public clang::ASTConsumer { public: - explicit AstConsumer(InMemoryIndex& index, clang::CompilerInstance& compiler) - : index_(index), compiler_(compiler) {} + AstConsumer(InMemoryIndex& index, clang::CompilerInstance& compiler, + bool support_incremental_indexing = false) + : index_(index), + compiler_(compiler), + support_incremental_indexing_(support_incremental_indexing) {} ~AstConsumer() override = default; void HandleTranslationUnit(clang::ASTContext& context) override { + if (support_incremental_indexing_) { + const clang::SourceManager& source_manager = context.getSourceManager(); + const clang::FileID main_file_id = source_manager.getMainFileID(); + const clang::OptionalFileEntryRef main_file = + source_manager.getFileEntryRefForID(main_file_id); + CHECK(main_file.has_value()) << "Couldn't retrieve the main file entry"; + + const clang::FileManager& file_manager = source_manager.getFileManager(); + llvm::SmallString<256> absolute_path(main_file->getName()); + file_manager.makeAbsolutePath(absolute_path); + + index_.SetTranslationUnit({absolute_path.data(), absolute_path.size()}); + 
} + AstVisitor visitor(index_, context, compiler_); visitor.TraverseDecl(context.getTranslationUnitDecl()); } @@ -53,11 +72,14 @@ class AstConsumer : public clang::ASTConsumer { private: InMemoryIndex& index_; clang::CompilerInstance& compiler_; + const bool support_incremental_indexing_; }; -IndexAction::IndexAction(FileCopier& file_copier, MergeQueue& merge_queue) +IndexAction::IndexAction(FileCopier& file_copier, MergeQueue& merge_queue, + bool support_incremental_indexing) : index_(std::make_unique(file_copier)), - merge_queue_(merge_queue) {} + merge_queue_(merge_queue), + support_incremental_indexing_(support_incremental_indexing) {} bool IndexAction::BeginSourceFileAction(clang::CompilerInstance& compiler) { CHECK(index_); @@ -79,15 +101,20 @@ void IndexAction::EndSourceFileAction() { merge_queue_.Add(std::move(index_)); } std::unique_ptr IndexAction::CreateASTConsumer( clang::CompilerInstance& compiler, llvm::StringRef path) { - return std::make_unique(*index_, compiler); + return std::make_unique(*index_, compiler, + support_incremental_indexing_); } IndexActionFactory::IndexActionFactory(FileCopier& file_copier, - MergeQueue& merge_queue) - : file_copier_(file_copier), merge_queue_(merge_queue) {} + MergeQueue& merge_queue, + bool support_incremental_indexing) + : file_copier_(file_copier), + merge_queue_(merge_queue), + support_incremental_indexing_(support_incremental_indexing) {} std::unique_ptr IndexActionFactory::create() { - return std::make_unique(file_copier_, merge_queue_); + return std::make_unique(file_copier_, merge_queue_, + support_incremental_indexing_); } } // namespace indexer } // namespace oss_fuzz diff --git a/infra/indexer/frontend/index_action.h b/infra/indexer/frontend/index_action.h index 5b00e5487ecb..104719955349 100644 --- a/infra/indexer/frontend/index_action.h +++ b/infra/indexer/frontend/index_action.h @@ -16,6 +16,7 @@ #define OSS_FUZZ_INFRA_INDEXER_FRONTEND_INDEX_ACTION_H_ #include +#include #include 
"indexer/index/file_copier.h" #include "indexer/index/in_memory_index.h" @@ -32,28 +33,32 @@ namespace indexer { // indexer/frontend.h should be used instead. class IndexAction : public clang::ASTFrontendAction { public: - explicit IndexAction(FileCopier& file_copier, MergeQueue& merge_queue); + explicit IndexAction(FileCopier& file_copier, MergeQueue& merge_queue, + bool support_incremental_indexing = false); bool BeginSourceFileAction(clang::CompilerInstance& compiler) override; void EndSourceFileAction() override; std::unique_ptr CreateASTConsumer( - clang::CompilerInstance& compiler, llvm::StringRef) override; + clang::CompilerInstance& compiler, llvm::StringRef path) override; private: std::unique_ptr index_; MergeQueue& merge_queue_; + bool support_incremental_indexing_; }; class IndexActionFactory : public clang::tooling::FrontendActionFactory { public: - explicit IndexActionFactory(FileCopier& file_copier, MergeQueue& merge_queue); + IndexActionFactory(FileCopier& file_copier, MergeQueue& merge_queue, + bool support_incremental_indexing = false); std::unique_ptr create() override; private: FileCopier& file_copier_; MergeQueue& merge_queue_; + const bool support_incremental_indexing_; }; } // namespace indexer } // namespace oss_fuzz diff --git a/infra/indexer/index/in_memory_index.cc b/infra/indexer/index/in_memory_index.cc index 5eb718a268b7..c9c7c34e7c3f 100644 --- a/infra/indexer/index/in_memory_index.cc +++ b/infra/indexer/index/in_memory_index.cc @@ -27,9 +27,11 @@ #include "indexer/index/types.h" #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" +#include "absl/container/inlined_vector.h" #include "absl/container/node_hash_map.h" #include "absl/log/check.h" #include "absl/log/log.h" +#include "absl/strings/string_view.h" #include "absl/types/span.h" namespace oss_fuzz { @@ -112,7 +114,233 @@ struct ComparePairFirst { InMemoryIndex::InMemoryIndex(FileCopier& file_copier) : file_copier_(file_copier) { 
Expand(kInitialReservationCount, kInitialReservationCount, - kInitialReservationCount, kInitialReservationCount); + kInitialReservationCount, kInitialReservationCount, + kInitialReservationCount, kInitialReservationCount, + kInitialReservationCount); +} + +InMemoryIndex::InMemoryIndex( + FileCopier& file_copier, const FlatIndex& flat_index, + const std::vector& excluded_tu_absolute_paths) + : file_copier_(file_copier) { + absl::flat_hash_set excluded_entity_ids; + absl::flat_hash_set excluded_reference_ids; + absl::flat_hash_set excluded_tu_ids; + + if (!excluded_tu_absolute_paths.empty()) { + CHECK(flat_index.incremental_indexing_metadata.has_value()) + << "Excluding translation units requires incremental indexing metadata"; + const IncrementalIndexingMetadata& incremental_metadata = + *flat_index.incremental_indexing_metadata; + + absl::flat_hash_set excluded_tu_index_paths; + for (const auto& path : excluded_tu_absolute_paths) { + excluded_tu_index_paths.insert(file_copier.AbsoluteToIndexPath(path)); + } + + TranslationUnitId tu_id = 0; + for (const TranslationUnit& tu : incremental_metadata.translation_units) { + if (excluded_tu_index_paths.contains(tu.index_path())) { + excluded_tu_ids.insert(tu_id); + } + ++tu_id; + } + + absl::flat_hash_map> + entity_to_tus; + for (const auto& entity_translation_unit : + flat_index.incremental_indexing_metadata->entity_translation_units) { + entity_to_tus[entity_translation_unit.entity_id()].insert( + entity_translation_unit.tu_id()); + } + + for (const auto& [entity_id, tu_ids] : entity_to_tus) { + bool all_excluded = true; + for (const auto& tu_id : tu_ids) { + if (!excluded_tu_ids.contains(tu_id)) { + all_excluded = false; + break; + } + } + if (all_excluded) { + excluded_entity_ids.insert(entity_id); + } + } + + // Check that referenced substitute entities are preserved. + // This should be guaranteed by the fact that an entity being in an included + // translation unit implies its substitute entity is also there. 
+ for (size_t entity_id = 0; entity_id < flat_index.entities.size(); + ++entity_id) { + if (excluded_entity_ids.contains(entity_id)) { + continue; + } + const Entity& entity = flat_index.entities[entity_id]; + if (entity.substitute_relationship().has_value()) { + CHECK(!excluded_entity_ids.contains( + entity.substitute_relationship()->substitute_entity_id())); + } + } + + // The same applies to virtual method links. + for (const VirtualMethodLink& link : flat_index.virtual_method_links) { + CHECK(!excluded_entity_ids.contains(link.parent())); + CHECK(!excluded_entity_ids.contains(link.child())); + } + + // Exclude references to entities that have been excluded. + for (size_t reference_id = 0; reference_id < flat_index.references.size(); + ++reference_id) { + if (excluded_entity_ids.contains( + flat_index.references[reference_id].entity_id())) { + excluded_reference_ids.insert(reference_id); + } + } + + // Exclude references only occurring in the excluded translation units (even + // if the referenced entities are not excluded). 
+ absl::flat_hash_map> + reference_to_tus; + for (const auto& reference_translation_unit : + flat_index.incremental_indexing_metadata + ->reference_translation_units) { + reference_to_tus[reference_translation_unit.reference_id()].insert( + reference_translation_unit.tu_id()); + } + + for (const auto& [reference_id, tu_ids] : reference_to_tus) { + bool all_excluded = true; + for (const auto& tu_id : tu_ids) { + if (!excluded_tu_ids.contains(tu_id)) { + all_excluded = false; + break; + } + } + if (all_excluded) { + excluded_reference_ids.insert(reference_id); + } + } + } + + absl::flat_hash_set locations_to_keep; + for (size_t entity_id = 0; entity_id < flat_index.entities.size(); + ++entity_id) { + if (!excluded_entity_ids.contains(entity_id)) { + locations_to_keep.insert(flat_index.entities[entity_id].location_id()); + } + } + for (size_t reference_id = 0; reference_id < flat_index.references.size(); + ++reference_id) { + if (!excluded_reference_ids.contains(reference_id)) { + locations_to_keep.insert( + flat_index.references[reference_id].location_id()); + } + } + + std::vector new_location_ids(flat_index.locations.size(), + kInvalidLocationId); + for (size_t location_id = 0; location_id < flat_index.locations.size(); + ++location_id) { + if (locations_to_keep.contains(location_id)) { + new_location_ids[location_id] = + GetIdForLocationWithIndexPath(flat_index.locations[location_id]); + } + } + + std::vector new_entity_ids(flat_index.entities.size(), + kInvalidEntityId); + for (size_t entity_id = 0; entity_id < flat_index.entities.size(); + ++entity_id) { + if (excluded_entity_ids.contains(entity_id)) { + continue; + } + if (new_entity_ids[entity_id] != kInvalidEntityId) { + continue; + } + absl::InlinedVector entities_to_process; + EntityId referenced_entity_id = entity_id; + while (true) { + entities_to_process.push_back(referenced_entity_id); + const Entity& entity = flat_index.entities[referenced_entity_id]; + if (entity.substitute_relationship().has_value() 
&& + new_entity_ids[entity.substitute_relationship() + ->substitute_entity_id()] == kInvalidEntityId) { + referenced_entity_id = + entity.substitute_relationship()->substitute_entity_id(); + } else { + break; + } + } + + // Process the substitute entities in reverse order to ensure the referenced + // entity IDs are ready whenever necessary. + for (auto it = entities_to_process.rbegin(); + it != entities_to_process.rend(); ++it) { + const EntityId processed_entity_id = *it; + const Entity& entity = flat_index.entities[processed_entity_id]; + const LocationId new_location_id = new_location_ids[entity.location_id()]; + const std::optional new_substitute_entity_id = + entity.substitute_relationship().has_value() + ? std::optional( + new_entity_ids[entity.substitute_relationship() + ->substitute_entity_id()]) + : std::nullopt; + new_entity_ids[processed_entity_id] = GetEntityId( + Entity(entity, new_location_id, new_substitute_entity_id)); + } + } + + std::vector new_reference_ids(flat_index.references.size(), + kInvalidReferenceId); + for (size_t reference_id = 0; reference_id < flat_index.references.size(); + ++reference_id) { + if (!excluded_reference_ids.contains(reference_id)) { + const Reference& reference = flat_index.references[reference_id]; + new_reference_ids[reference_id] = + GetReferenceId({new_entity_ids[reference.entity_id()], + new_location_ids[reference.location_id()]}); + } + } + + for (size_t i = 0; i < flat_index.virtual_method_links.size(); ++i) { + const VirtualMethodLink& link = flat_index.virtual_method_links[i]; + if (!excluded_entity_ids.contains(link.parent()) && + !excluded_entity_ids.contains(link.child())) { + GetVirtualMethodLinkId( + {new_entity_ids[link.parent()], new_entity_ids[link.child()]}); + } + } + + if (flat_index.incremental_indexing_metadata.has_value()) { + const auto& metadata = *flat_index.incremental_indexing_metadata; + std::vector new_tu_ids(metadata.translation_units.size(), + kInvalidTranslationUnitId); + for (size_t 
tu_id = 0; tu_id < metadata.translation_units.size(); ++tu_id) { + if (excluded_tu_ids.contains(tu_id)) { + continue; + } + new_tu_ids[tu_id] = + GetTranslationUnitId(metadata.translation_units[tu_id]); + } + for (const auto& entity_translation_unit : + metadata.entity_translation_units) { + if (excluded_tu_ids.contains(entity_translation_unit.tu_id())) { + continue; + } + AddEntityTranslationUnit( + {new_entity_ids[entity_translation_unit.entity_id()], + new_tu_ids[entity_translation_unit.tu_id()]}); + } + for (const auto& reference_translation_unit : + metadata.reference_translation_units) { + if (excluded_tu_ids.contains(reference_translation_unit.tu_id())) { + continue; + } + AddReferenceTranslationUnit( + {new_reference_ids[reference_translation_unit.reference_id()], + new_tu_ids[reference_translation_unit.tu_id()]}); + } + } } InMemoryIndex::~InMemoryIndex() = default; @@ -123,7 +351,10 @@ void InMemoryIndex::Merge(const InMemoryIndex& other) { // this is not an issue, since we almost always use the same indexes to merge // into, so the overly-large reservation will be used later. 
Expand(other.locations_.size(), other.entities_.size(), - other.references_.size(), other.virtual_method_links_.size()); + other.references_.size(), other.virtual_method_links_.size(), + other.translation_units_.size(), + other.entity_translation_units_.size(), + other.reference_translation_units_.size()); std::vector new_location_ids(other.locations_.size(), kInvalidLocationId); @@ -170,25 +401,63 @@ void InMemoryIndex::Merge(const InMemoryIndex& other) { new_entity_ids[id] = new_id; } + std::vector new_reference_ids(other.references_.size(), + kInvalidReferenceId); for (const auto& [reference, id] : other.references_) { - GetReferenceId({new_entity_ids[reference.entity_id()], - new_location_ids[reference.location_id()]}); + new_reference_ids[id] = + GetReferenceId({new_entity_ids[reference.entity_id()], + new_location_ids[reference.location_id()]}); } for (const auto& [link, id] : other.virtual_method_links_) { GetVirtualMethodLinkId( {new_entity_ids[link.parent()], new_entity_ids[link.child()]}); } + + std::vector new_translation_unit_ids( + other.translation_units_.size(), kInvalidEntityId); + for (const auto& [tu, id] : other.translation_units_) { + new_translation_unit_ids[id] = GetTranslationUnitId(tu); + } + + for (const auto& entity_translation_unit : other.entity_translation_units_) { + AddEntityTranslationUnit( + {new_entity_ids[entity_translation_unit.entity_id()], + new_translation_unit_ids[entity_translation_unit.tu_id()]}); + } + + for (const auto& reference_translation_unit : + other.reference_translation_units_) { + AddReferenceTranslationUnit( + {new_reference_ids[reference_translation_unit.reference_id()], + new_translation_unit_ids[reference_translation_unit.tu_id()]}); + } } void InMemoryIndex::Expand(size_t locations_count, size_t entities_count, size_t references_count, - size_t virtual_method_links_count) { + size_t virtual_method_links_count, + size_t translation_units_count, + size_t entity_translation_units_count, + size_t 
reference_translation_units_count) { locations_.reserve(locations_.size() + locations_count); entities_.reserve(entities_.size() + entities_count); references_.reserve(references_.size() + references_count); virtual_method_links_.reserve(virtual_method_links_.size() + virtual_method_links_count); + translation_units_.reserve(translation_units_.size() + + translation_units_count); + entity_translation_units_.reserve(entity_translation_units_.size() + + entity_translation_units_count); + reference_translation_units_.reserve(reference_translation_units_.size() + + reference_translation_units_count); +} + +void InMemoryIndex::SetTranslationUnit(absl::string_view absolute_path) { + const std::string index_path = + file_copier_.AbsoluteToIndexPath(absolute_path); + current_translation_unit_id_ = + GetTranslationUnitId(TranslationUnit(index_path)); } LocationId InMemoryIndex::GetLocationId(Location location) { @@ -234,6 +503,11 @@ EntityId InMemoryIndex::GetEntityId(const Entity& entity) { CHECK_LT(relationship->substitute_entity_id(), entity_id); } } + + if (current_translation_unit_id_ != kInvalidTranslationUnitId) { + AddEntityTranslationUnit({entity_id, current_translation_unit_id_}); + } + return entity_id; } @@ -256,7 +530,13 @@ ReferenceId InMemoryIndex::GetReferenceId(const Reference& reference) { if (inserted) { next_reference_id_++; } - return iter->second; + const ReferenceId reference_id = iter->second; + + if (current_translation_unit_id_ != kInvalidTranslationUnitId) { + AddReferenceTranslationUnit({reference_id, current_translation_unit_id_}); + } + + return reference_id; } VirtualMethodLinkId InMemoryIndex::GetVirtualMethodLinkId( @@ -269,6 +549,26 @@ VirtualMethodLinkId InMemoryIndex::GetVirtualMethodLinkId( return iter->second; } +TranslationUnitId InMemoryIndex::GetTranslationUnitId( + const TranslationUnit& tu) { + auto [iter, inserted] = + translation_units_.insert({tu, next_translation_unit_id_}); + if (inserted) { + next_translation_unit_id_++; + } 
+ return iter->second; +} + +void InMemoryIndex::AddEntityTranslationUnit( + const EntityTranslationUnit& entity_translation_unit) { + entity_translation_units_.insert(entity_translation_unit); +} + +void InMemoryIndex::AddReferenceTranslationUnit( + const ReferenceTranslationUnit& reference_translation_unit) { + reference_translation_units_.insert(reference_translation_unit); +} + FlatIndex InMemoryIndex::Export() && { FlatIndex result; @@ -301,7 +601,7 @@ FlatIndex InMemoryIndex::Export() && { } } - std::vector new_entity_ids(entities_.size(), kInvalidEntityId); + std::vector new_entity_ids(entities_.size(), kInvalidEntityId); { // Repeat for entities, but updating stale location ids. std::vector> sorted_entities; @@ -314,8 +614,8 @@ FlatIndex InMemoryIndex::Export() && { CHECK_LT(entity.substitute_relationship()->substitute_entity_id(), id); } - auto& iter = sorted_entities.emplace_back(entity, id); - Entity& new_entity = iter.first; + sorted_entities.emplace_back(entity, id); + Entity& new_entity = sorted_entities.back().first; new_entity.location_id_ = new_location_id; } std::sort(sorted_entities.begin(), sorted_entities.end(), @@ -363,22 +663,38 @@ FlatIndex InMemoryIndex::Export() && { } } - // Here we don't need to maintain a mapping from the old to the new reference - // ids. - result.references.reserve(references_.size()); - for (const auto& [reference, id] : references_) { - EntityId new_entity_id = new_entity_ids[reference.entity_id()]; - CHECK_NE(new_entity_id, kInvalidEntityId); - LocationId new_location_id = new_location_ids[reference.location_id()]; - CHECK_NE(new_location_id, kInvalidLocationId); - result.references.emplace_back(new_entity_id, new_location_id); + // Maintain old-to-new reference ids for the case of reference-TU pairs below. 
+ std::vector new_reference_ids(references_.size(), + kInvalidReferenceId); + { + std::vector> sorted_references; + for (const auto& [reference, id] : references_) { + EntityId new_entity_id = new_entity_ids[reference.entity_id()]; + CHECK_NE(new_entity_id, kInvalidEntityId); + LocationId new_location_id = new_location_ids[reference.location_id()]; + CHECK_NE(new_location_id, kInvalidLocationId); + sorted_references.emplace_back(Reference(new_entity_id, new_location_id), + id); + } + std::sort(sorted_references.begin(), sorted_references.end(), + ComparePairFirst()); + CHECK_EQ(sorted_references.size(), references_.size()); + references_.clear(); + + result.references.reserve(references_.size()); + ReferenceId new_id = 0; + for (auto& [reference, old_id] : sorted_references) { + // Avoid duplicate references that may have arisen from entity linkage. + if (!result.references.empty() && result.references.back() == reference) { + new_reference_ids[old_id] = new_id - 1; + } else { + result.references.emplace_back(reference); + new_reference_ids[old_id] = new_id++; + } + } } - std::sort(result.references.begin(), result.references.end()); - // Remove duplicates that could have arisen due to location column erasure. - auto last = std::unique(result.references.begin(), result.references.end()); - result.references.erase(last, result.references.end()); - // Likewise, no need to maintain the old-to-new link id mapping. + // We don't have to maintain the old-to-new link id mapping. 
result.virtual_method_links.reserve(virtual_method_links_.size()); for (const auto& [link, id] : virtual_method_links_) { EntityId new_parent = new_entity_ids[link.parent()]; @@ -390,6 +706,70 @@ FlatIndex InMemoryIndex::Export() && { std::sort(result.virtual_method_links.begin(), result.virtual_method_links.end()); + if (!translation_units_.empty()) { + auto& metadata = result.incremental_indexing_metadata.emplace(); + + std::vector new_translation_unit_ids( + translation_units_.size(), kInvalidEntityId); + { + std::vector> sorted_tus; + for (const auto& [tu, id] : translation_units_) { + sorted_tus.emplace_back(tu, id); + } + std::sort(sorted_tus.begin(), sorted_tus.end(), ComparePairFirst()); + CHECK_EQ(sorted_tus.size(), translation_units_.size()); + translation_units_.clear(); + + metadata.translation_units.reserve(sorted_tus.size()); + TranslationUnitId new_id = 0; + for (auto& [tu, old_id] : sorted_tus) { + metadata.translation_units.emplace_back(tu); + new_translation_unit_ids[old_id] = new_id++; + } + } + + metadata.entity_translation_units.reserve(entity_translation_units_.size()); + for (const auto& entity_translation_unit : entity_translation_units_) { + EntityId new_entity_id = + new_entity_ids[entity_translation_unit.entity_id()]; + CHECK_NE(new_entity_id, kInvalidEntityId); + TranslationUnitId new_tu_id = + new_translation_unit_ids[entity_translation_unit.tu_id()]; + CHECK_NE(new_tu_id, kInvalidTranslationUnitId); + metadata.entity_translation_units.emplace_back(new_entity_id, new_tu_id); + } + std::sort(metadata.entity_translation_units.begin(), + metadata.entity_translation_units.end()); + // Remove duplicate entity-TU pairs that may have arisen from entity + // linkage. 
+ metadata.entity_translation_units.erase( + std::unique(metadata.entity_translation_units.begin(), + metadata.entity_translation_units.end()), + metadata.entity_translation_units.end()); + + metadata.reference_translation_units.reserve( + reference_translation_units_.size()); + for (const auto& reference_translation_unit : + reference_translation_units_) { + ReferenceId new_reference_id = + new_reference_ids[reference_translation_unit.reference_id()]; + CHECK_NE(new_reference_id, kInvalidReferenceId); + TranslationUnitId new_tu_id = + new_translation_unit_ids[reference_translation_unit.tu_id()]; + CHECK_NE(new_tu_id, kInvalidTranslationUnitId); + metadata.reference_translation_units.emplace_back(new_reference_id, + new_tu_id); + } + std::sort(metadata.reference_translation_units.begin(), + metadata.reference_translation_units.end()); + // Remove duplicate reference-TU pairs that may have arisen from entity + // linkage. + metadata.reference_translation_units.erase( + std::unique(metadata.reference_translation_units.begin(), + metadata.reference_translation_units.end()), + metadata.reference_translation_units.end()); + } + return result; } diff --git a/infra/indexer/index/in_memory_index.h b/infra/indexer/index/in_memory_index.h index 46894b98400f..9c54beb36bd8 100644 --- a/infra/indexer/index/in_memory_index.h +++ b/infra/indexer/index/in_memory_index.h @@ -16,12 +16,15 @@ #define OSS_FUZZ_INFRA_INDEXER_INDEX_IN_MEMORY_INDEX_H_ #include +#include #include #include "indexer/index/file_copier.h" #include "indexer/index/types.h" #include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" #include "absl/container/node_hash_map.h" +#include "absl/strings/string_view.h" namespace oss_fuzz { namespace indexer { @@ -36,6 +39,14 @@ class InMemoryIndex { // in the index. 
explicit InMemoryIndex(FileCopier& file_copier); + // Creates an `InMemoryIndex` from `flat_index`, omitting entities and + // references from translation units given by `excluded_tu_absolute_paths`; + // if the latter is provided, `flat_index` is required to contain incremental + // indexing metadata. + InMemoryIndex( + FileCopier& file_copier, const FlatIndex& flat_index, + const std::vector& excluded_tu_absolute_paths = {}); + ~InMemoryIndex(); void Merge(const InMemoryIndex& other); @@ -44,7 +55,16 @@ class InMemoryIndex { // `locations_count` new unique locations, `entities_count` new unique // entities, ... void Expand(size_t locations_count, size_t entities_count, - size_t references_count, size_t virtual_method_links_count); + size_t references_count, size_t virtual_method_links_count, + size_t translation_units_count, + size_t entity_translation_units_count, + size_t reference_translation_units_count); + + // Further `GetEntityId` / `GetReferenceId` will attribute the corresponding + // entities / references to the translation unit specified by `absolute_path` + // (note that there can be existing translation attributions for those, which + // are preserved). + void SetTranslationUnit(absl::string_view absolute_path); // The `GetXxxId` functions return the id of an existing, matching object if // there is already one in the index, or allocate a new id if there is not an @@ -72,6 +92,11 @@ class InMemoryIndex { // Like `GetLocationId`, but requires the path to be already index-adjusted. LocationId GetIdForLocationWithIndexPath(const Location& location); + // More `GetXxxId` functions that are only used internally. 
+ TranslationUnitId GetTranslationUnitId(const TranslationUnit&); + void AddEntityTranslationUnit(const EntityTranslationUnit&); + void AddReferenceTranslationUnit(const ReferenceTranslationUnit&); + // Although we could sort location_lookup_ in advance, the performance impact // on indexing if we use a btree_map is significant, and it's much faster // to sort the index at the end. @@ -90,6 +115,14 @@ class InMemoryIndex { VirtualMethodLinkId next_virtual_method_link_id_ = 0; absl::flat_hash_map virtual_method_links_; + + TranslationUnitId next_translation_unit_id_ = 0; + absl::flat_hash_map translation_units_; + // If `kInvalidTranslationUnitId`, no TU attribution is maintained. + TranslationUnitId current_translation_unit_id_ = kInvalidTranslationUnitId; + + absl::flat_hash_set entity_translation_units_; + absl::flat_hash_set reference_translation_units_; }; } // namespace indexer diff --git a/infra/indexer/index/in_memory_index_unittest.cc b/infra/indexer/index/in_memory_index_unittest.cc index 94b89a39f284..96b2afb06de6 100644 --- a/infra/indexer/index/in_memory_index_unittest.cc +++ b/infra/indexer/index/in_memory_index_unittest.cc @@ -24,6 +24,7 @@ #include "indexer/index/file_copier.h" #include "indexer/index/types.h" +#include "gmock/gmock.h" #include "gtest/gtest.h" #include "absl/log/check.h" #include "absl/strings/match.h" @@ -87,9 +88,9 @@ std::vector GetTestEntities() { std::vector GetTestReferences() { // This should return a sorted vector of References. 
return EnsureSorted({ - Reference(0, 0), - Reference(0, 1), - Reference(1, 1), + Reference(/*entity_id=*/0, /*location_id=*/0), + Reference(/*entity_id=*/0, /*location_id=*/1), + Reference(/*entity_id=*/1, /*location_id=*/1), }); } @@ -266,6 +267,45 @@ TEST(InMemoryIndexTest, References) { } } +TEST(InMemoryIndexTest, TranslationUnits) { + FileCopier file_copier("", ::testing::TempDir(), {"/"}); + InMemoryIndex index(file_copier); + auto locations = GetTestLocations(); + auto entities = GetTestEntities(); + auto references = GetTestReferences(); + index.SetTranslationUnit("/a/b.cc"); + for (const auto& location : locations) { + index.GetLocationId(location); + } + for (const auto& entity : entities) { + index.GetEntityId(entity); + } + for (const auto& reference : references) { + index.GetReferenceId(reference); + } + FlatIndex flat_index = std::move(index).Export(); + ASSERT_TRUE(flat_index.incremental_indexing_metadata.has_value()); + ASSERT_EQ(flat_index.incremental_indexing_metadata->translation_units.size(), + 1); + EXPECT_EQ(flat_index.incremental_indexing_metadata->translation_units[0] + .index_path(), + "/a/b.cc"); + EXPECT_EQ( + flat_index.incremental_indexing_metadata->entity_translation_units.size(), + flat_index.entities.size()); + EXPECT_EQ(flat_index.incremental_indexing_metadata + ->reference_translation_units.size(), + flat_index.references.size()); + for (const auto& etu : + flat_index.incremental_indexing_metadata->entity_translation_units) { + EXPECT_EQ(etu.tu_id(), 0); + } + for (const auto& rtu : + flat_index.incremental_indexing_metadata->reference_translation_units) { + EXPECT_EQ(rtu.tu_id(), 0); + } +} + TEST(InMemoryIndexTest, Merge) { FileCopier file_copier("", ::testing::TempDir(), {"/"}); InMemoryIndex index_one(file_copier); @@ -320,6 +360,67 @@ TEST(InMemoryIndexTest, Merge) { } } +TEST(InMemoryIndexTest, MergeWithTranslationUnits) { + FileCopier file_copier("", ::testing::TempDir(), {"/"}); + InMemoryIndex index_one(file_copier); + 
InMemoryIndex index_two(file_copier); + auto locations = GetTestLocations(); + auto entities = GetTestEntities(); + auto references = GetTestReferences(); + + index_one.SetTranslationUnit("/a/1.cc"); + for (const auto& location : locations) { + index_one.GetLocationId(location); + } + for (size_t i = 0; i < entities.size(); ++i) { + if (i < 3) { + index_one.GetEntityId(entities[i]); + } + } + for (const auto& reference : references) { + index_one.GetReferenceId(reference); + } + + index_two.SetTranslationUnit("/a/2.cc"); + for (const auto& location : locations) { + index_two.GetLocationId(location); + } + for (size_t i = 0; i < entities.size(); ++i) { + index_two.GetEntityId(entities[i]); + } + + InMemoryIndex index(file_copier); + index.Merge(index_one); + index.Merge(index_two); + FlatIndex flat_index = std::move(index).Export(); + ASSERT_EQ(flat_index.locations.size(), locations.size()); + ASSERT_EQ(flat_index.entities.size(), entities.size() - 1); + ASSERT_EQ(flat_index.references.size(), references.size()); + + ASSERT_TRUE(flat_index.incremental_indexing_metadata.has_value()); + const auto& metadata = *flat_index.incremental_indexing_metadata; + ASSERT_EQ(metadata.translation_units.size(), 2); + EXPECT_EQ(metadata.translation_units[0].index_path(), "/a/1.cc"); + EXPECT_EQ(metadata.translation_units[1].index_path(), "/a/2.cc"); + + EXPECT_THAT(metadata.entity_translation_units, + ::testing::ElementsAre( + EntityTranslationUnit(/*entity_id=*/0, /*tu_id=*/0), + EntityTranslationUnit(/*entity_id=*/0, /*tu_id=*/1), + EntityTranslationUnit(/*entity_id=*/1, /*tu_id=*/0), + EntityTranslationUnit(/*entity_id=*/1, /*tu_id=*/1), + EntityTranslationUnit(/*entity_id=*/2, /*tu_id=*/0), + EntityTranslationUnit(/*entity_id=*/2, /*tu_id=*/1), + EntityTranslationUnit(/*entity_id=*/3, /*tu_id=*/1), + EntityTranslationUnit(/*entity_id=*/4, /*tu_id=*/1), + EntityTranslationUnit(/*entity_id=*/5, /*tu_id=*/1))); + EXPECT_THAT(metadata.reference_translation_units, + 
::testing::ElementsAre( + ReferenceTranslationUnit(/*reference_id=*/0, /*tu_id=*/0), + ReferenceTranslationUnit(/*reference_id=*/1, /*tu_id=*/0), + ReferenceTranslationUnit(/*reference_id=*/2, /*tu_id=*/0))); +} + TEST(InMemoryIndexTest, MergeWithSubstituteEntities) { FileCopier file_copier("", ::testing::TempDir(), {"/"}); InMemoryIndex index_one(file_copier); @@ -394,5 +495,82 @@ TEST(InMemoryIndexTest, MergeWithSubstituteEntities) { SubstituteRelationship::Kind::kIsTemplateInstantiationOf, 5))); } } + +TEST(InMemoryIndexTest, ConstructWithExcludedTranslationUnits) { + auto tmp_dir_path = std::filesystem::path(::testing::TempDir()); + auto loc0_path = tmp_dir_path / "a/b.cc"; + auto loc1_path = tmp_dir_path / "c/d.h"; + PopulateLocationFiles( + {Location(loc0_path.string(), 1, 2), Location(loc1_path.string(), 3, 4)}, + tmp_dir_path); + + FlatIndex flat_index; + flat_index.locations = { + Location(loc0_path.string(), 1, 2), + Location(loc1_path.string(), 3, 4), + }; + flat_index.entities = { + Entity(Entity::Kind::kEnumConstant, "", "kEnumValue", "", 1, false, false, + std::nullopt, "123"), + Entity(Entity::Kind::kClass, "foo::", "Bar", "", 0), + Entity(Entity::Kind::kFunction, "foo::", "Bar", "()", 1, false, false, + std::nullopt, std::nullopt, + Entity::VirtualMethodKind::kPureVirtual), + }; + flat_index.references = { + Reference(/*entity_id=*/0, /*location_id=*/1), + Reference(/*entity_id=*/1, /*location_id=*/0), + }; + flat_index.virtual_method_links = { + VirtualMethodLink(2, 2), + }; + flat_index.incremental_indexing_metadata.emplace(); + flat_index.incremental_indexing_metadata->translation_units = { + TranslationUnit("/tu1"), + TranslationUnit("/tu2"), + }; + flat_index.incremental_indexing_metadata->entity_translation_units = { + EntityTranslationUnit(/*entity_id=*/0, /*tu_id=*/0), + EntityTranslationUnit(/*entity_id=*/0, /*tu_id=*/1), + EntityTranslationUnit(/*entity_id=*/1, /*tu_id=*/1), + EntityTranslationUnit(/*entity_id=*/2, /*tu_id=*/1), + }; + 
flat_index.incremental_indexing_metadata->reference_translation_units = { + ReferenceTranslationUnit(/*reference_id=*/0, /*tu_id=*/1), + ReferenceTranslationUnit(/*reference_id=*/1, /*tu_id=*/0), + ReferenceTranslationUnit(/*reference_id=*/1, /*tu_id=*/1), + }; + + FileCopier file_copier(/*base_path=*/"", /*index_path=*/::testing::TempDir(), + /*extra_paths=*/{"/"}); + InMemoryIndex index(file_copier, flat_index, + /*excluded_tu_absolute_paths=*/{"/tu1"}); + + FlatIndex result = std::move(index).Export(); + + EXPECT_THAT( + result.locations, + ::testing::ElementsAre(flat_index.locations[0], flat_index.locations[1])); + EXPECT_THAT(result.entities, ::testing::ElementsAre(flat_index.entities[0], + flat_index.entities[1], + flat_index.entities[2])); + EXPECT_THAT(result.references, + ::testing::ElementsAre(flat_index.references[0], + flat_index.references[1])); + EXPECT_THAT(result.virtual_method_links, + ::testing::ElementsAre(VirtualMethodLink(2, 2))); + ASSERT_TRUE(result.incremental_indexing_metadata.has_value()); + EXPECT_THAT(result.incremental_indexing_metadata->translation_units, + ::testing::ElementsAre(TranslationUnit("/tu2"))); + EXPECT_THAT(result.incremental_indexing_metadata->entity_translation_units, + ::testing::ElementsAre( + EntityTranslationUnit(/*entity_id=*/0, /*tu_id=*/0), + EntityTranslationUnit(/*entity_id=*/1, /*tu_id=*/0), + EntityTranslationUnit(/*entity_id=*/2, /*tu_id=*/0))); + EXPECT_THAT(result.incremental_indexing_metadata->reference_translation_units, + ::testing::ElementsAre( + ReferenceTranslationUnit(/*reference_id=*/0, /*tu_id=*/0), + ReferenceTranslationUnit(/*reference_id=*/1, /*tu_id=*/0))); +} } // namespace indexer } // namespace oss_fuzz diff --git a/infra/indexer/index/sqlite.cc b/infra/indexer/index/sqlite.cc index 3169f6fef0eb..5cae35a8ca6a 100644 --- a/infra/indexer/index/sqlite.cc +++ b/infra/indexer/index/sqlite.cc @@ -16,8 +16,10 @@ #include #include // NOLINT +#include #include #include +#include #include 
"indexer/index/types.h" #include "absl/cleanup/cleanup.h" @@ -30,6 +32,13 @@ namespace oss_fuzz { namespace indexer { namespace { + +// Note: We could in principle enforce UNIQUE constraints on `reference` foreign +// key pairs, as well as those of `virtual_method_link` and +// `entity_translation_unit` (as an extreme, non-ID fields of e.g. `location` +// could also be made into a UNIQUE tuple). But those are unique by construction +// now and we hope to avoid the overhead of checking those constraints. + const char kCreateDb[] = "PRAGMA foreign_keys = ON;\n" "PRAGMA user_version = " SCHEMA_VERSION @@ -87,11 +96,33 @@ const char kCreateDb[] = "CREATE INDEX virtual_method_link_parent ON virtual_method_link(" " parent_entity_id);\n"; +const char kCreateIncrementalIndexingSupportTables[] = + "CREATE TABLE translation_unit(\n" + " id INTEGER PRIMARY KEY,\n" + " path TEXT);\n" + "\n" + "CREATE TABLE entity_translation_unit(\n" + " id INTEGER PRIMARY KEY,\n" + " entity_id INT NOT NULL,\n" + " tu_id INT NOT NULL,\n" + " FOREIGN KEY (entity_id) REFERENCES entity(id),\n" + " FOREIGN KEY (tu_id) REFERENCES translation_unit(id));\n" + "\n" + "CREATE TABLE reference_translation_unit(\n" + " id INTEGER PRIMARY KEY,\n" + " reference_id INT NOT NULL,\n" + " tu_id INT NOT NULL,\n" + " FOREIGN KEY (reference_id) REFERENCES reference(id),\n" + " FOREIGN KEY (tu_id) REFERENCES translation_unit(id));\n"; + const char kInsertLocation[] = "INSERT INTO location\n" " (id, dirname, basename, start_line, end_line)\n" " VALUES (?1, ?2, ?3, ?4, ?5);"; +const char kSelectLocations[] = + "SELECT dirname, basename, start_line, end_line FROM location ORDER BY id;"; + const char kInsertEntity[] = "INSERT INTO entity\n" " (id, kind, is_incomplete, name_prefix, name, name_suffix, location_id,\n" @@ -99,16 +130,55 @@ const char kInsertEntity[] = " virtual_method_kind)\n" " VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11);"; +const char kSelectEntities[] = + "SELECT kind, is_incomplete, 
name_prefix, name, name_suffix, " + " location_id, substitute_entity_id, substitute_relationship_kind, " + " enum_value, virtual_method_kind\n" + " FROM entity\n" + " ORDER BY id;"; + const char kInsertReference[] = "INSERT INTO reference\n" " (id, entity_id, location_id)\n" " VALUES (?1, ?2, ?3);"; +const char kSelectReferences[] = + "SELECT entity_id, location_id FROM reference ORDER BY id;"; + const char kInsertLink[] = "INSERT INTO virtual_method_link\n" " (id, parent_entity_id, child_entity_id)\n" " VALUES (?1, ?2, ?3);"; +const char kSelectLinks[] = + "SELECT parent_entity_id, child_entity_id\n" + " FROM virtual_method_link\n" + " ORDER BY id;"; + +const char kInsertTranslationUnit[] = + "INSERT INTO translation_unit\n" + " (id, path)\n" + " VALUES (?1, ?2);"; + +const char kSelectTranslationUnits[] = + "SELECT path FROM translation_unit ORDER BY id;"; + +const char kInsertEntityTranslationUnit[] = + "INSERT INTO entity_translation_unit\n" + " (id, entity_id, tu_id)\n" + " VALUES (?1, ?2, ?3);"; + +const char kSelectEntityTranslationUnits[] = + "SELECT entity_id, tu_id FROM entity_translation_unit ORDER BY id;"; + +const char kInsertReferenceTranslationUnit[] = + "INSERT INTO reference_translation_unit\n" + " (id, reference_id, tu_id)\n" + " VALUES (?1, ?2, ?3);"; + +const char kSelectReferenceTranslationUnits[] = + "SELECT reference_id, tu_id FROM reference_translation_unit ORDER BY id;"; + const char kFinalizeDb[] = "VACUUM;\n" "REINDEX;\n" @@ -163,8 +233,8 @@ bool InsertLocations(sqlite3* db, absl::Span locations) { bool InsertEntities(sqlite3* db, absl::Span entities) { // `substitute_entity_id` foreign key can refer to a yet-unadded entity. 
- if (sqlite3_exec(db, "PRAGMA foreign_keys = OFF;", nullptr, - nullptr, nullptr) != SQLITE_OK) { + if (sqlite3_exec(db, "PRAGMA foreign_keys = OFF;", nullptr, nullptr, + nullptr) != SQLITE_OK) { LOG(ERROR) << "sqlite disabling foreign keys failed: `" << sqlite3_errmsg(db) << "`"; return false; @@ -261,8 +331,8 @@ bool InsertEntities(sqlite3* db, absl::Span entities) { std::move(cleanup).Cancel(); sqlite3_finalize(insert_entity); - if (sqlite3_exec(db, "PRAGMA foreign_keys = ON;", nullptr, nullptr, nullptr) - != SQLITE_OK) { + if (sqlite3_exec(db, "PRAGMA foreign_keys = ON;", nullptr, nullptr, + nullptr) != SQLITE_OK) { LOG(ERROR) << "sqlite re-enabling foreign keys failed: `" << sqlite3_errmsg(db) << "`"; return false; @@ -320,7 +390,7 @@ bool InsertVirtualMethodLinks(sqlite3* db, return false; } - for (ReferenceId i = 0; i < links.size(); ++i) { + for (VirtualMethodLinkId i = 0; i < links.size(); ++i) { const VirtualMethodLink& link = links[i]; if (sqlite3_bind_int64(insert_link, 1, i) != SQLITE_OK || sqlite3_bind_int64(insert_link, 2, link.parent()) != SQLITE_OK || @@ -332,7 +402,7 @@ bool InsertVirtualMethodLinks(sqlite3* db, } if (sqlite3_step(insert_link) != SQLITE_DONE) { - LOG(ERROR) << "sqlite executing insert_reference failed: `" + LOG(ERROR) << "sqlite executing insert_link failed: `" << sqlite3_errmsg(db) << "`"; sqlite3_finalize(insert_link); return false; @@ -345,10 +415,411 @@ bool InsertVirtualMethodLinks(sqlite3* db, sqlite3_finalize(insert_link); return true; } + +bool InsertTranslationUnits( + sqlite3* db, absl::Span translation_units) { + sqlite3_stmt* insert_tu = nullptr; + if (sqlite3_prepare_v2(db, kInsertTranslationUnit, + sizeof(kInsertTranslationUnit), &insert_tu, + nullptr) != SQLITE_OK) { + LOG(ERROR) << "sqlite compiling prepared statement failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + + for (TranslationUnitId i = 0; i < translation_units.size(); ++i) { + const TranslationUnit& tu = translation_units[i]; + if 
(sqlite3_bind_int64(insert_tu, 1, i) != SQLITE_OK || + sqlite3_bind_text(insert_tu, 2, tu.index_path().data(), + tu.index_path().size(), SQLITE_STATIC) != SQLITE_OK) { + LOG(ERROR) << "sqlite binding insert_tu failed: `" << sqlite3_errmsg(db) + << "`"; + sqlite3_finalize(insert_tu); + return false; + } + + if (sqlite3_step(insert_tu) != SQLITE_DONE) { + LOG(ERROR) << "sqlite executing insert_tu failed: `" << sqlite3_errmsg(db) + << "`"; + sqlite3_finalize(insert_tu); + return false; + } + + sqlite3_reset(insert_tu); + sqlite3_clear_bindings(insert_tu); + } + + sqlite3_finalize(insert_tu); + return true; +} + +bool InsertEntityTranslationUnits( + sqlite3* db, + absl::Span entity_translation_units) { + sqlite3_stmt* insert_entity_tu = nullptr; + if (sqlite3_prepare_v2(db, kInsertEntityTranslationUnit, + sizeof(kInsertEntityTranslationUnit), + &insert_entity_tu, nullptr) != SQLITE_OK) { + LOG(ERROR) << "sqlite compiling prepared statement failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + + for (EntityTranslationUnitId i = 0; i < entity_translation_units.size(); + ++i) { + const EntityTranslationUnit& entity_tu = entity_translation_units[i]; + if (sqlite3_bind_int64(insert_entity_tu, 1, i) != SQLITE_OK || + sqlite3_bind_int64(insert_entity_tu, 2, entity_tu.entity_id()) != + SQLITE_OK || + sqlite3_bind_int64(insert_entity_tu, 3, entity_tu.tu_id()) != + SQLITE_OK) { + LOG(ERROR) << "sqlite binding insert_entity_tu failed: `" + << sqlite3_errmsg(db) << "`"; + sqlite3_finalize(insert_entity_tu); + return false; + } + + if (sqlite3_step(insert_entity_tu) != SQLITE_DONE) { + LOG(ERROR) << "sqlite executing insert_entity_tu failed: `" + << sqlite3_errmsg(db) << "`"; + sqlite3_finalize(insert_entity_tu); + return false; + } + + sqlite3_reset(insert_entity_tu); + sqlite3_clear_bindings(insert_entity_tu); + } + + sqlite3_finalize(insert_entity_tu); + return true; +} + +bool InsertReferenceTranslationUnits( + sqlite3* db, + absl::Span reference_translation_units) { 
+ sqlite3_stmt* insert_reference_tu = nullptr; + if (sqlite3_prepare_v2(db, kInsertReferenceTranslationUnit, + sizeof(kInsertReferenceTranslationUnit), + &insert_reference_tu, nullptr) != SQLITE_OK) { + LOG(ERROR) << "sqlite compiling prepared statement failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + + for (ReferenceTranslationUnitId i = 0; i < reference_translation_units.size(); + ++i) { + const ReferenceTranslationUnit& reference_tu = + reference_translation_units[i]; + if (sqlite3_bind_int64(insert_reference_tu, 1, i) != SQLITE_OK || + sqlite3_bind_int64(insert_reference_tu, 2, + reference_tu.reference_id()) != SQLITE_OK || + sqlite3_bind_int64(insert_reference_tu, 3, reference_tu.tu_id()) != + SQLITE_OK) { + LOG(ERROR) << "sqlite binding insert_reference_tu failed: `" + << sqlite3_errmsg(db) << "`"; + sqlite3_finalize(insert_reference_tu); + return false; + } + + if (sqlite3_step(insert_reference_tu) != SQLITE_DONE) { + LOG(ERROR) << "sqlite executing insert_reference_tu failed: `" + << sqlite3_errmsg(db) << "`"; + sqlite3_finalize(insert_reference_tu); + return false; + } + + sqlite3_reset(insert_reference_tu); + sqlite3_clear_bindings(insert_reference_tu); + } + + sqlite3_finalize(insert_reference_tu); + return true; +} + +// Returns text column `column` of `stmt` as an optional string. Returns +// `nullopt` if the column value is `NULL`. +std::optional OptionalColumnText(sqlite3_stmt* stmt, int column) { + const char* text = + reinterpret_cast(sqlite3_column_text(stmt, column)); + if (text) { + return std::string(text); + } + return std::nullopt; +} + +// Returns text column `column` of `stmt` as a string. Returns an empty string +// if the column value is `NULL`. 
+std::string ColumnText(sqlite3_stmt* stmt, int column) { + return OptionalColumnText(stmt, column).value_or(std::string()); +} + +bool ReadLocations(sqlite3* db, std::vector& locations) { + locations.clear(); + + sqlite3_stmt* select_locations = nullptr; + if (sqlite3_prepare_v2(db, kSelectLocations, sizeof(kSelectLocations), + &select_locations, nullptr) != SQLITE_OK) { + LOG(ERROR) << "sqlite compiling prepared statement failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + absl::Cleanup cleanup = [select_locations] { + sqlite3_finalize(select_locations); + }; + + int code; + while ((code = sqlite3_step(select_locations)) == SQLITE_ROW) { + std::string dirname = ColumnText(select_locations, 0); + std::string basename = ColumnText(select_locations, 1); + std::filesystem::path path = std::filesystem::path(dirname) / basename; + locations.emplace_back(path.string(), + sqlite3_column_int(select_locations, 2), + sqlite3_column_int(select_locations, 3)); + } + + if (code != SQLITE_DONE) { + LOG(ERROR) << "sqlite executing select_locations failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + + return true; +} + +bool ReadEntities(sqlite3* db, std::vector& entities) { + entities.clear(); + + sqlite3_stmt* select_entities = nullptr; + if (sqlite3_prepare_v2(db, kSelectEntities, sizeof(kSelectEntities), + &select_entities, nullptr) != SQLITE_OK) { + LOG(ERROR) << "sqlite compiling prepared statement failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + absl::Cleanup cleanup = [select_entities] { + sqlite3_finalize(select_entities); + }; + + int code; + while ((code = sqlite3_step(select_entities)) == SQLITE_ROW) { + Entity::Kind kind = + static_cast(sqlite3_column_int(select_entities, 0)); + bool is_incomplete = sqlite3_column_int(select_entities, 1); + std::string name_prefix = ColumnText(select_entities, 2); + std::string name = ColumnText(select_entities, 3); + std::string name_suffix = ColumnText(select_entities, 4); + LocationId 
location_id = sqlite3_column_int64(select_entities, 5); + + std::optional substitute_relationship; + if (sqlite3_column_type(select_entities, 6) != SQLITE_NULL) { + EntityId substitute_entity_id = sqlite3_column_int64(select_entities, 6); + SubstituteRelationship::Kind substitute_relationship_kind = + static_cast( + sqlite3_column_int(select_entities, 7)); + substitute_relationship.emplace(substitute_relationship_kind, + substitute_entity_id); + } + + std::optional enum_value = + OptionalColumnText(select_entities, 8); + Entity::VirtualMethodKind virtual_method_kind = + static_cast( + sqlite3_column_int(select_entities, 9)); + + entities.emplace_back(kind, name_prefix, name, name_suffix, location_id, + is_incomplete, /*is_weak=*/false, + substitute_relationship, enum_value, + virtual_method_kind); + } + + if (code != SQLITE_DONE) { + LOG(ERROR) << "sqlite executing select_entities failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + + return true; +} + +bool ReadReferences(sqlite3* db, std::vector& references) { + references.clear(); + + sqlite3_stmt* select_references = nullptr; + if (sqlite3_prepare_v2(db, kSelectReferences, sizeof(kSelectReferences), + &select_references, nullptr) != SQLITE_OK) { + LOG(ERROR) << "sqlite compiling prepared statement failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + absl::Cleanup cleanup = [select_references] { + sqlite3_finalize(select_references); + }; + + int code; + while ((code = sqlite3_step(select_references)) == SQLITE_ROW) { + references.emplace_back(sqlite3_column_int64(select_references, 0), + sqlite3_column_int64(select_references, 1)); + } + + if (code != SQLITE_DONE) { + LOG(ERROR) << "sqlite executing select_references failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + + return true; +} + +bool ReadVirtualMethodLinks(sqlite3* db, + std::vector& links) { + links.clear(); + + sqlite3_stmt* select_links = nullptr; + if (sqlite3_prepare_v2(db, kSelectLinks, sizeof(kSelectLinks), 
&select_links, + nullptr) != SQLITE_OK) { + LOG(ERROR) << "sqlite compiling prepared statement failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + absl::Cleanup cleanup = [select_links] { sqlite3_finalize(select_links); }; + + int code; + while ((code = sqlite3_step(select_links)) == SQLITE_ROW) { + links.emplace_back(sqlite3_column_int64(select_links, 0), + sqlite3_column_int64(select_links, 1)); + } + + if (code != SQLITE_DONE) { + LOG(ERROR) << "sqlite executing select_links failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + + return true; +} + +bool ReadTranslationUnits(sqlite3* db, + std::vector& translation_units) { + translation_units.clear(); + + sqlite3_stmt* select_tus = nullptr; + if (sqlite3_prepare_v2(db, kSelectTranslationUnits, + sizeof(kSelectTranslationUnits), &select_tus, + nullptr) != SQLITE_OK) { + LOG(ERROR) << "sqlite compiling prepared statement failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + absl::Cleanup cleanup = [select_tus] { sqlite3_finalize(select_tus); }; + + int code; + while ((code = sqlite3_step(select_tus)) == SQLITE_ROW) { + translation_units.emplace_back(ColumnText(select_tus, 0)); + } + + if (code != SQLITE_DONE) { + LOG(ERROR) << "sqlite executing select_tus failed: `" << sqlite3_errmsg(db) + << "`"; + return false; + } + + return true; +} + +bool ReadEntityTranslationUnits( + sqlite3* db, std::vector& entity_translation_units) { + entity_translation_units.clear(); + + sqlite3_stmt* select_entity_tus = nullptr; + if (sqlite3_prepare_v2(db, kSelectEntityTranslationUnits, + sizeof(kSelectEntityTranslationUnits), + &select_entity_tus, nullptr) != SQLITE_OK) { + LOG(ERROR) << "sqlite compiling prepared statement failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + absl::Cleanup cleanup = [select_entity_tus] { + sqlite3_finalize(select_entity_tus); + }; + + int code; + while ((code = sqlite3_step(select_entity_tus)) == SQLITE_ROW) { + entity_translation_units.emplace_back( + 
sqlite3_column_int64(select_entity_tus, 0), + sqlite3_column_int64(select_entity_tus, 1)); + } + + if (code != SQLITE_DONE) { + LOG(ERROR) << "sqlite executing select_entity_tus failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + + return true; +} + +bool ReadReferenceTranslationUnits( + sqlite3* db, + std::vector& reference_translation_units) { + reference_translation_units.clear(); + + sqlite3_stmt* select_reference_tus = nullptr; + if (sqlite3_prepare_v2(db, kSelectReferenceTranslationUnits, + sizeof(kSelectReferenceTranslationUnits), + &select_reference_tus, nullptr) != SQLITE_OK) { + LOG(ERROR) << "sqlite compiling prepared statement failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + absl::Cleanup cleanup = [select_reference_tus] { + sqlite3_finalize(select_reference_tus); + }; + + int code; + while ((code = sqlite3_step(select_reference_tus)) == SQLITE_ROW) { + reference_translation_units.emplace_back( + sqlite3_column_int64(select_reference_tus, 0), + sqlite3_column_int64(select_reference_tus, 1)); + } + + if (code != SQLITE_DONE) { + LOG(ERROR) << "sqlite executing select_reference_tus failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + + return true; +} + +bool TableExists(sqlite3* db, const char* table_name) { + sqlite3_stmt* stmt = nullptr; + const char query[] = + "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?1;"; + if (sqlite3_prepare_v2(db, query, sizeof(query), &stmt, nullptr) != + SQLITE_OK) { + LOG(ERROR) << "sqlite compiling prepared statement failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + absl::Cleanup cleanup = [stmt] { sqlite3_finalize(stmt); }; + if (sqlite3_bind_text(stmt, 1, table_name, -1, SQLITE_STATIC) != SQLITE_OK) { + LOG(ERROR) << "sqlite binding table_name failed: `" << sqlite3_errmsg(db) + << "`"; + return false; + } + + int code = sqlite3_step(stmt); + if (code != SQLITE_ROW && code != SQLITE_DONE) { + LOG(ERROR) << "sqlite executing select from 
`sqlite_master` failed: `" + << sqlite3_errmsg(db) << "`"; + return false; + } + return code == SQLITE_ROW; +} + } // anonymous namespace -bool SaveAsSqlite(const FlatIndex& index, const std::string& path) { - LOG(INFO) << "creating in-memory database"; +bool InitializeSqlite() { const size_t kSqliteMmapSize = 0x1000000000ull; if (sqlite3_config(SQLITE_CONFIG_SINGLETHREAD) != SQLITE_OK || sqlite3_config(SQLITE_CONFIG_MMAP_SIZE, kSqliteMmapSize, @@ -357,7 +828,71 @@ bool SaveAsSqlite(const FlatIndex& index, const std::string& path) { LOG(ERROR) << "sqlite setup failed"; return false; } + return true; +} +std::optional LoadFromSqlite(const std::string& path) { + sqlite3* db = nullptr; + if (sqlite3_open_v2(path.c_str(), &db, SQLITE_OPEN_READONLY, nullptr) != + SQLITE_OK) { + LOG(ERROR) << "sqlite open database failed: `" << sqlite3_errmsg(db) << "`"; + sqlite3_close(db); + return std::nullopt; + } + absl::Cleanup db_cleanup = [db] { sqlite3_close(db); }; + + FlatIndex index; + + LOG(INFO) << "reading locations"; + if (!ReadLocations(db, /*out*/ index.locations)) { + return std::nullopt; + } + + LOG(INFO) << "reading entities"; + if (!ReadEntities(db, /*out*/ index.entities)) { + return std::nullopt; + } + + LOG(INFO) << "reading references"; + if (!ReadReferences(db, /*out*/ index.references)) { + return std::nullopt; + } + + LOG(INFO) << "reading virtual method links"; + if (!ReadVirtualMethodLinks(db, /*out*/ index.virtual_method_links)) { + return std::nullopt; + } + + if (TableExists(db, "translation_unit")) { + LOG(INFO) << "reading translation units"; + index.incremental_indexing_metadata.emplace(); + if (!ReadTranslationUnits( + db, + /*out*/ index.incremental_indexing_metadata->translation_units)) { + return std::nullopt; + } + + LOG(INFO) << "reading entity - translation unit pairs"; + if (!ReadEntityTranslationUnits(db, + /*out*/ index.incremental_indexing_metadata + ->entity_translation_units)) { + return std::nullopt; + } + + LOG(INFO) << "reading 
reference - translation unit pairs"; + if (!ReadReferenceTranslationUnits( + db, + /*out*/ index.incremental_indexing_metadata + ->reference_translation_units)) { + return std::nullopt; + } + } + + return index; +} + +bool SaveAsSqlite(const FlatIndex& index, const std::string& path) { + LOG(INFO) << "creating in-memory database"; sqlite3* db = nullptr; char* error = nullptr; if (sqlite3_open(":memory:", &db) != SQLITE_OK || @@ -391,6 +926,39 @@ bool SaveAsSqlite(const FlatIndex& index, const std::string& path) { return false; } + if (index.incremental_indexing_metadata.has_value()) { + const IncrementalIndexingMetadata& metadata = + *index.incremental_indexing_metadata; + + LOG(INFO) << "creating incremental indexing support tables"; + if (sqlite3_exec(db, kCreateIncrementalIndexingSupportTables, nullptr, + nullptr, &error) != SQLITE_OK) { + LOG(ERROR) << "incremental indexing support table creation failed: `" + << error << "`"; + sqlite3_close(db); + return false; + } + + LOG(INFO) << "inserting translation units"; + if (!InsertTranslationUnits(db, metadata.translation_units)) { + sqlite3_close(db); + return false; + } + + LOG(INFO) << "inserting entity - translation unit pairs"; + if (!InsertEntityTranslationUnits(db, metadata.entity_translation_units)) { + sqlite3_close(db); + return false; + } + + LOG(INFO) << "inserting reference - translation unit pairs"; + if (!InsertReferenceTranslationUnits( + db, metadata.reference_translation_units)) { + sqlite3_close(db); + return false; + } + } + LOG(INFO) << "finalizing database"; if (sqlite3_exec(db, kFinalizeDb, nullptr, nullptr, &error) != SQLITE_OK) { LOG(ERROR) << "database finalization failed: `" << error << "`"; @@ -427,5 +995,6 @@ bool SaveAsSqlite(const FlatIndex& index, const std::string& path) { sqlite3_close(db); return backup_success; } + } // namespace indexer } // namespace oss_fuzz diff --git a/infra/indexer/index/sqlite.h b/infra/indexer/index/sqlite.h index 563075e111cd..ea7ad5d7c409 100644 --- 
a/infra/indexer/index/sqlite.h +++ b/infra/indexer/index/sqlite.h @@ -15,13 +15,18 @@ #ifndef OSS_FUZZ_INFRA_INDEXER_INDEX_SQLITE_H_ #define OSS_FUZZ_INFRA_INDEXER_INDEX_SQLITE_H_ +#include #include #include "indexer/index/types.h" namespace oss_fuzz { namespace indexer { + +bool InitializeSqlite(); +std::optional LoadFromSqlite(const std::string& path); bool SaveAsSqlite(const FlatIndex& index, const std::string& path); + } // namespace indexer } // namespace oss_fuzz diff --git a/infra/indexer/index/sqlite_unittest.cc b/infra/indexer/index/sqlite_unittest.cc new file mode 100644 index 000000000000..06d5d3eef351 --- /dev/null +++ b/infra/indexer/index/sqlite_unittest.cc @@ -0,0 +1,100 @@ +// Copyright 2025 Google LLC. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "indexer/index/sqlite.h" + +#include // NOLINT +#include +#include + +#include "indexer/index/types.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace oss_fuzz { +namespace indexer { +namespace { + +using ::testing::ElementsAreArray; + +class SqliteTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { ASSERT_TRUE(InitializeSqlite()); } +}; + +TEST_F(SqliteTest, SaveAndLoad) { + FlatIndex index; + index.locations = { + Location("a/b.cc", 1, 2), + Location("c/d.h", 3, 4), + }; + index.entities = { + Entity(Entity::Kind::kEnumConstant, "", "kEnumValue", "", 1, false, false, + std::nullopt, "123"), + Entity(Entity::Kind::kClass, "foo::", "Bar", "", 0), + Entity(Entity::Kind::kFunction, "foo::", "Bar", "()", 1, false, false, + std::nullopt, std::nullopt, + Entity::VirtualMethodKind::kPureVirtual), + }; + index.references = { + Reference(/*entity_id=*/0, /*location_id=*/1), + Reference(/*entity_id=*/1, /*location_id=*/0), + }; + index.virtual_method_links = { + VirtualMethodLink(2, 2), + }; + index.incremental_indexing_metadata.emplace(); + index.incremental_indexing_metadata->translation_units = { + TranslationUnit("tu1"), + TranslationUnit("tu2"), + }; + index.incremental_indexing_metadata->entity_translation_units = { + EntityTranslationUnit(/*entity_id=*/0, /*tu_id=*/0), + EntityTranslationUnit(/*entity_id=*/1, /*tu_id=*/1), + EntityTranslationUnit(/*entity_id=*/2, /*tu_id=*/1), + }; + index.incremental_indexing_metadata->reference_translation_units = { + ReferenceTranslationUnit(/*reference_id=*/0, /*tu_id=*/1), + ReferenceTranslationUnit(/*reference_id=*/1, /*tu_id=*/0), + }; + + const std::string path = + (std::filesystem::path(::testing::TempDir()) / "test.sqlite").string(); + + ASSERT_TRUE(SaveAsSqlite(index, path)); + std::optional loaded_index = LoadFromSqlite(path); + ASSERT_TRUE(loaded_index.has_value()); + + EXPECT_THAT(loaded_index->locations, ElementsAreArray(index.locations)); + 
EXPECT_THAT(loaded_index->entities, ElementsAreArray(index.entities)); + EXPECT_THAT(loaded_index->references, ElementsAreArray(index.references)); + EXPECT_THAT(loaded_index->virtual_method_links, + ElementsAreArray(index.virtual_method_links)); + ASSERT_TRUE(loaded_index->incremental_indexing_metadata.has_value()); + EXPECT_THAT( + loaded_index->incremental_indexing_metadata->translation_units, + ElementsAreArray(index.incremental_indexing_metadata->translation_units)); + EXPECT_THAT( + loaded_index->incremental_indexing_metadata->entity_translation_units, + ElementsAreArray( + index.incremental_indexing_metadata->entity_translation_units)); + EXPECT_THAT( + loaded_index->incremental_indexing_metadata->reference_translation_units, + ElementsAreArray( + index.incremental_indexing_metadata->reference_translation_units)); +} + +} // namespace +} // namespace indexer +} // namespace oss_fuzz diff --git a/infra/indexer/index/types.h b/infra/indexer/index/types.h index 0af0067ca4ed..5481548f4731 100644 --- a/infra/indexer/index/types.h +++ b/infra/indexer/index/types.h @@ -40,8 +40,13 @@ using LocationId = uint64_t; using EntityId = uint64_t; using ReferenceId = uint64_t; using VirtualMethodLinkId = uint64_t; +using TranslationUnitId = uint64_t; +using EntityTranslationUnitId = uint64_t; +using ReferenceTranslationUnitId = uint64_t; constexpr LocationId kInvalidLocationId = 0xffffffffffffffffull; constexpr EntityId kInvalidEntityId = 0xffffffffffffffffull; +constexpr ReferenceId kInvalidReferenceId = 0xffffffffffffffffull; +constexpr TranslationUnitId kInvalidTranslationUnitId = 0xffffffffffffffffull; inline bool IsRealPath(absl::string_view path) { // Examples of built-in paths: `` and ``. 
@@ -180,6 +185,8 @@ class Entity {
     CHECK_EQ(substitute_relationship_.has_value(),
              new_substitute_entity_id.has_value());
     if (substitute_relationship_.has_value()) {
+      // A present replacement ID must be a real entity, never the sentinel.
+      CHECK_NE(*new_substitute_entity_id, kInvalidEntityId);
       substitute_relationship_->entity_id_ = *new_substitute_entity_id;
     }
   }
@@ -343,6 +350,88 @@ H AbslHashValue(H h, const VirtualMethodLink& link) {
   return H::combine(std::move(h), link.parent(), link.child());
 }
 
+// Represents a single translation unit. The only state is `index_path`, which
+// also drives equality, ordering and hashing.
+class TranslationUnit {
+ public:
+  // Takes the path by value so that callers passing a temporary avoid a copy.
+  explicit TranslationUnit(std::string index_path)
+      : index_path_(std::move(index_path)) {}
+
+  const std::string& index_path() const { return index_path_; }
+
+  bool operator==(const TranslationUnit&) const = default;
+  std::strong_ordering operator<=>(const TranslationUnit&) const = default;
+
+ private:
+  std::string index_path_;
+};
+
+template <typename H>
+H AbslHashValue(H h, const TranslationUnit& tu) {
+  return H::combine(std::move(h), tu.index_path());
+}
+
+// Links an entity to a translation unit it is encountered in (many-to-many).
+class EntityTranslationUnit {
+ public:
+  // `entity_id` must not be `kInvalidEntityId`.
+  EntityTranslationUnit(EntityId entity_id, TranslationUnitId tu_id)
+      : entity_id_(entity_id), tu_id_(tu_id) {
+    CHECK_NE(entity_id, kInvalidEntityId);
+  }
+
+  EntityId entity_id() const { return entity_id_; }
+  TranslationUnitId tu_id() const { return tu_id_; }
+
+  bool operator==(const EntityTranslationUnit&) const = default;
+  std::strong_ordering operator<=>(const EntityTranslationUnit&) const =
+      default;
+
+ private:
+  EntityId entity_id_;
+  TranslationUnitId tu_id_;
+};
+
+template <typename H>
+H AbslHashValue(H h, const EntityTranslationUnit& etu) {
+  return H::combine(std::move(h), etu.entity_id(), etu.tu_id());
+}
+
+// Links a reference to a translation unit it is encountered in (many-to-many).
+class ReferenceTranslationUnit {
+ public:
+  // `reference_id` must not be `kInvalidReferenceId`, mirroring the entity-ID
+  // check in `EntityTranslationUnit`.
+  ReferenceTranslationUnit(ReferenceId reference_id, TranslationUnitId tu_id)
+      : reference_id_(reference_id), tu_id_(tu_id) {
+    CHECK_NE(reference_id, kInvalidReferenceId);
+  }
+
+  ReferenceId reference_id() const { return reference_id_; }
+  TranslationUnitId tu_id() const { return tu_id_; }
+
+  bool operator==(const ReferenceTranslationUnit&) const = default;
+  std::strong_ordering operator<=>(const ReferenceTranslationUnit&) const =
+      default;
+
+ private:
+  ReferenceId reference_id_;
+  TranslationUnitId tu_id_;
+};
+
+template <typename H>
+H AbslHashValue(H h, const ReferenceTranslationUnit& rtu) {
+  return H::combine(std::move(h), rtu.reference_id(), rtu.tu_id());
+}
+
+// A set of optional metadata for incremental indexing support.
+struct IncrementalIndexingMetadata {
+  std::vector<TranslationUnit> translation_units;
+  std::vector<EntityTranslationUnit> entity_translation_units;
+  std::vector<ReferenceTranslationUnit> reference_translation_units;
+};
+
 // A simple holder for a sorted index, used as an interchange format/interface
 // definition between uses of the index.
 struct FlatIndex {
@@ -350,6 +439,7 @@ struct FlatIndex {
   std::vector<Entity> entities;
   std::vector<Reference> references;
   std::vector<VirtualMethodLink> virtual_method_links;
+  std::optional<IncrementalIndexingMetadata> incremental_indexing_metadata;
 };
 
 namespace testing_internal {