diff --git a/include/phasar/PhasarLLVM/Utils/LLVMIRToSrc.h b/include/phasar/PhasarLLVM/Utils/LLVMIRToSrc.h index f65f07239e..06fa418a0d 100644 --- a/include/phasar/PhasarLLVM/Utils/LLVMIRToSrc.h +++ b/include/phasar/PhasarLLVM/Utils/LLVMIRToSrc.h @@ -123,6 +123,19 @@ void to_json(nlohmann::json &J, const SourceCodeInfo &Info); [[nodiscard]] std::optional getDebugLocation(const llvm::Value *V); +/// Call \p Callback(DILocalVariable *, Value *) for every dbg.declare record +/// attached to \p I. Handles both the legacy intrinsic form and the +/// DbgVariableRecord form introduced in LLVM 19. +void forEachDbgDeclare( + const llvm::Instruction &I, + llvm::function_ref + Callback); + +/// Strip DW_TAG_typedef chains from \p Ty and return the underlying +/// DICompositeType if it is a DW_TAG_structure_type; return null otherwise. +[[nodiscard]] const llvm::DICompositeType * +stripTypedefsToStruct(const llvm::DIType *Ty); + } // namespace psr #endif diff --git a/lib/PhasarLLVM/TaintConfig/LLVMTaintConfig.cpp b/lib/PhasarLLVM/TaintConfig/LLVMTaintConfig.cpp index 0d8e23b656..bb15ac4686 100644 --- a/lib/PhasarLLVM/TaintConfig/LLVMTaintConfig.cpp +++ b/lib/PhasarLLVM/TaintConfig/LLVMTaintConfig.cpp @@ -14,16 +14,17 @@ #include "phasar/PhasarLLVM/TaintConfig/TaintConfigBase.h" #include "phasar/PhasarLLVM/TaintConfig/TaintConfigData.h" #include "phasar/PhasarLLVM/Utils/Annotation.h" +#include "phasar/PhasarLLVM/Utils/LLVMIRToSrc.h" #include "phasar/PhasarLLVM/Utils/LLVMShorthands.h" #include "phasar/Utils/EnumFlags.h" #include "phasar/Utils/Logger.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/ADT/StringMap.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" #include @@ -127,64 +128,61 @@ void LLVMTaintConfig::addAllFunctions(const LLVMProjectIRDB &IRDB, LLVMTaintConfig::LLVMTaintConfig(const psr::LLVMProjectIRDB &Code, const TaintConfigData &Config) { - // handle functions addAllFunctions(Code, Config); - // handle variables - // scope can be a function name or a struct. - std::unordered_map StructConfigMap; + if (Config.Variables.empty()) { + return; + } - // read all struct types from config - llvm::DebugInfoFinder DIF; - const auto *M = Code.getModule(); + // Map LLVM struct type → configured field name, built by correlating + // debug-declare records with allocas (avoids IR type naming assumptions). + std::unordered_map StructConfigMap; - DIF.processModule(*M); - for (const auto &VarDesc : Config.Variables) { - for (const auto &Ty : DIF.types()) { - if (Ty->getTag() == llvm::dwarf::DW_TAG_structure_type && - Ty->getName() == VarDesc.Scope) { - // TODO: Below loop looks wrong! Why on earth are we looping over the - // SAME set of types every time? - for (const auto &LlvmStructTy : M->getIdentifiedStructTypes()) { - StructConfigMap.insert( - std::pair(LlvmStructTy, - VarDesc.Name)); - } + { + // Pre-build scope → field name for O(1) lookup during the alloca scan. + llvm::StringMap StructScopeIndex; + for (const auto &VarDesc : Config.Variables) { + StructScopeIndex.try_emplace(VarDesc.Scope, VarDesc.Name); + } + + for (const auto *Fun : Code.getAllFunctions()) { + for (const auto &I : llvm::instructions(Fun)) { + forEachDbgDeclare(I, [&](const llvm::DILocalVariable *LocalVar, + const llvm::Value *Addr) { + const auto *AllocaI = llvm::dyn_cast(Addr); + if (!AllocaI) { + return; + } + const auto *StructTy = + llvm::dyn_cast(AllocaI->getAllocatedType()); + if (!StructTy) { + return; + } + const auto *CT = stripTypedefsToStruct(LocalVar->getType()); + if (!CT) { + return; + } + if (auto It = StructScopeIndex.find(CT->getName()); + It != StructScopeIndex.end()) { + StructConfigMap.try_emplace(StructTy, It->second); + } + }); } } } - // add corresponding Allocas or getElementPtr instructions to the taint - // category - const auto AddVariable = [this](const VariableData &VarDesc, - const llvm::Instruction &I) -> bool { -#if LLVM_VERSION_MAJOR > 18 - for (const llvm::DbgVariableRecord &DbR : - llvm::filterDbgVars(I.getDbgRecordRange())) { - const auto *Var = DbR.getVariable(); - assert(Var != nullptr); - if (Var->getName() == VarDesc.Name && Var->getLine() == VarDesc.Line) { - addTaintCategory(DbR.getAddress(), VarDesc.Cat); - return true; - } - } -#endif - - if (const auto *DbgDeclare = llvm::dyn_cast(&I)) { - const llvm::DILocalVariable *LocalVar = DbgDeclare->getVariable(); - // matching line number with for Allocas - if (LocalVar->getName() == VarDesc.Name && - LocalVar->getLine() == VarDesc.Line) { - addTaintCategory(DbgDeclare->getAddress(), VarDesc.Cat); - return true; + const llvm::Instruction &I) { + forEachDbgDeclare(I, [&](const llvm::DILocalVariable *Var, + const llvm::Value *Addr) { + if (Var->getLine() == VarDesc.Line && Var->getName() == VarDesc.Name) { + addTaintCategory(Addr, VarDesc.Cat); } - } - return false; + }); }; for (const auto &VarDesc : Config.Variables) { - for (const auto &Fun : Code.getAllFunctions()) { + for (const auto *Fun : Code.getAllFunctions()) { const bool MatchingFunScope = [&] { if (VarDesc.Scope == Fun->getName()) { return true; @@ -194,21 +192,22 @@ LLVMTaintConfig::LLVMTaintConfig(const psr::LLVMProjectIRDB &Code, }(); for (const auto &I : llvm::instructions(Fun)) { - if (MatchingFunScope && AddVariable(VarDesc, I)) { - continue; + if (MatchingFunScope) { + AddVariable(VarDesc, I); } if (!StructConfigMap.empty()) { - // Ignorning line numbers for getElementPtr instructions + // Ignoring line numbers for getElementPtr instructions. + // starts_with covers the edge case where the same name appears as + // both a local variable and a struct member + // (JsonConfig/fun_member_02) if (const auto *Gep = llvm::dyn_cast(&I)) { const auto *StType = llvm::dyn_cast(Gep->getSourceElementType()); - if (StType && StructConfigMap.contains(StType)) { - auto VarName = StructConfigMap.at(StType); - // using substr to cover the edge case in which same variable - // name is present as a local variable and also as a struct - // member variable. (Ex. JsonConfig/fun_member_02.cpp) - if (Gep->getName().starts_with(VarName)) { + if (StType) { + if (auto It = StructConfigMap.find(StType); + It != StructConfigMap.end() && + Gep->getName().starts_with(It->second)) { addTaintCategory(Gep, VarDesc.Cat); } } @@ -224,10 +223,10 @@ LLVMTaintConfig::LLVMTaintConfig(const psr::LLVMProjectIRDB &AnnotatedCode) { llvm::SmallVector VarAnnotations{}; llvm::SmallVector PtrAnnotations{}; for (const auto *F : AnnotatedCode.getAllFunctions()) { - if (F->getName().starts_with("llvm.var.annotation")) { + if (F->getIntrinsicID() == llvm::Intrinsic::var_annotation) { VarAnnotations.push_back(F); } - if (F->getName().starts_with("llvm.ptr.annotation")) { + if (F->getIntrinsicID() == llvm::Intrinsic::ptr_annotation) { PtrAnnotations.push_back(F); } } @@ -343,13 +342,13 @@ void LLVMTaintConfig::addTaintCategory(const llvm::Value *Val, // bool LLVMTaintConfig::isSourceImpl(const llvm::Value *V) const { - return SourceValues.count(V); + return SourceValues.contains(V); } bool LLVMTaintConfig::isSinkImpl(const llvm::Value *V) const { - return SinkValues.count(V); + return SinkValues.contains(V); } bool LLVMTaintConfig::isSanitizerImpl(const llvm::Value *V) const { - return SanitizerValues.count(V); + return SanitizerValues.contains(V); } void LLVMTaintConfig::forAllGeneratedValuesAtImpl( @@ -366,7 +365,7 @@ void LLVMTaintConfig::forAllGeneratedValuesAtImpl( if (Callee) { for (const auto &Arg : Callee->args()) { - if (SourceValues.count(&Arg)) { + if (SourceValues.contains(&Arg)) { Handler(Inst->getOperand(Arg.getArgNo())); } } @@ -376,13 +375,13 @@ void LLVMTaintConfig::forAllGeneratedValuesAtImpl( /// If any function is called with a variable that was defined as source, we /// don't want to re-generate the value. for (const auto *Op : Inst->operand_values()) { - if (SourceValues.count(Op)) { + if (SourceValues.contains(Op)) { Handler(Op); } } } - if (SourceValues.count(Inst)) { + if (SourceValues.contains(Inst)) { Handler(Inst); } } @@ -401,7 +400,7 @@ void LLVMTaintConfig::forAllLeakCandidatesAtImpl( if (Callee) { for (const auto &Arg : Callee->args()) { - if (SinkValues.count(&Arg)) { + if (SinkValues.contains(&Arg)) { Handler(Inst->getOperand(Arg.getArgNo())); } } @@ -427,7 +426,7 @@ void LLVMTaintConfig::forAllSanitizedValuesAtImpl( if (Callee) { for (const auto &Arg : Callee->args()) { - if (SanitizerValues.count(&Arg)) { + if (SanitizerValues.contains(&Arg)) { Handler(Inst->getOperand(Arg.getArgNo())); } } @@ -448,7 +447,7 @@ bool LLVMTaintConfig::generatesValuesAtImpl( return true; } - return SourceValues.count(Inst) || + return SourceValues.contains(Inst) || llvm::any_of(Inst->operand_values(), [this](const auto *Op) { return SourceValues.count(Op); }); diff --git a/lib/PhasarLLVM/Utils/LLVMIRToSrc.cpp b/lib/PhasarLLVM/Utils/LLVMIRToSrc.cpp index d0240bce7e..b35c6b3aba 100644 --- a/lib/PhasarLLVM/Utils/LLVMIRToSrc.cpp +++ b/lib/PhasarLLVM/Utils/LLVMIRToSrc.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/Demangle/Demangle.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -24,6 +25,9 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#if LLVM_VERSION_MAJOR > 18 +#include "llvm/IR/DebugProgramInstruction.h" +#endif #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" @@ -549,3 +553,33 @@ std::optional psr::getDebugLocation(const llvm::Value *V) { return std::nullopt; } + +void psr::forEachDbgDeclare( + const llvm::Instruction &I, + llvm::function_ref + Callback) { +#if LLVM_VERSION_MAJOR > 18 + for (const llvm::DbgVariableRecord &DbR : + llvm::filterDbgVars(I.getDbgRecordRange())) { + if (DbR.isDbgDeclare()) { + Callback(DbR.getVariable(), DbR.getAddress()); + } + } +#endif + if (const auto *D = llvm::dyn_cast(&I)) { + Callback(D->getVariable(), D->getAddress()); + } +} + +const llvm::DICompositeType * +psr::stripTypedefsToStruct(const llvm::DIType *Ty) { + while (const auto *D = llvm::dyn_cast_or_null(Ty)) { + if (D->getTag() != llvm::dwarf::DW_TAG_typedef) { + return nullptr; + } + Ty = D->getBaseType(); + } + const auto *CT = llvm::dyn_cast_or_null(Ty); + return (CT && CT->getTag() == llvm::dwarf::DW_TAG_structure_type) ? CT + : nullptr; +} diff --git a/test/llvm_test_code/TaintConfig/JsonConfig/CMakeLists.txt b/test/llvm_test_code/TaintConfig/JsonConfig/CMakeLists.txt index 25a6cd9c83..1d4c6525fc 100644 --- a/test/llvm_test_code/TaintConfig/JsonConfig/CMakeLists.txt +++ b/test/llvm_test_code/TaintConfig/JsonConfig/CMakeLists.txt @@ -6,6 +6,7 @@ set(JsonTaintConfigTestSources basic_03.c basic_04.c data_member_01.cpp + data_member_02.cpp fun_member_01.cpp fun_member_02.cpp name_mangling_01.cpp @@ -25,6 +26,7 @@ set(JsonTaintConfigFiles basic_03_config.json basic_04_config.json data_member_01_config.json + data_member_02_config.json fun_member_01_config.json fun_member_02_config.json name_mangling_01_config.json diff --git a/test/llvm_test_code/TaintConfig/JsonConfig/data_member_02.cpp b/test/llvm_test_code/TaintConfig/JsonConfig/data_member_02.cpp new file mode 100644 index 0000000000..b8dac9af49 --- /dev/null +++ b/test/llvm_test_code/TaintConfig/JsonConfig/data_member_02.cpp @@ -0,0 +1,15 @@ + +struct Y { + int A = 99; +}; + +struct X { + int A = 13; + int B = 0; +}; + +int main() { + X V; + Y W; + return V.A + V.B + W.A; +} diff --git a/test/llvm_test_code/TaintConfig/JsonConfig/data_member_02_config.json b/test/llvm_test_code/TaintConfig/JsonConfig/data_member_02_config.json new file mode 100644 index 0000000000..91465c04ad --- /dev/null +++ b/test/llvm_test_code/TaintConfig/JsonConfig/data_member_02_config.json @@ -0,0 +1,13 @@ +{ + "name": "taint-config-test", + "version": 1.0, + "variables": [ + { + "file": "data_member_02.cpp", + "scope": "X", + "name": "A", + "line": 7, + "cat": "source" + } + ] +} diff --git a/unittests/PhasarLLVM/TaintConfig/TaintConfigTest.cpp b/unittests/PhasarLLVM/TaintConfig/TaintConfigTest.cpp index b25df13084..5dac6f01f7 100644 --- a/unittests/PhasarLLVM/TaintConfig/TaintConfigTest.cpp +++ b/unittests/PhasarLLVM/TaintConfig/TaintConfigTest.cpp @@ -340,6 +340,29 @@ TEST_F(TaintConfigTest, DataMember_01_Json) { ASSERT_TRUE(TConfig.isSource(I)); } +TEST_F(TaintConfigTest, DataMember_02_Json) { + // Regression test for + // https://github.com/secure-software-engineering/phasar/issues/835 Two + // structs (X and Y) both have a field named "A". Only X::A is configured as a + // source. Verify X::A GEPs are sources and Y::A GEPs are not. + const std::string File = "data_member_02_cpp_dbg.ll"; + const std::string Config = "data_member_02_config.json"; + auto JsonConfig = + psr::parseTaintConfig(PathToJsonTaintConfigTestCode + Config); + psr::LLVMProjectIRDB IR({PathToJsonTaintConfigTestCode + File}); + psr::LLVMTaintConfig TConfig(IR, JsonConfig); + + // GEP for X::A in X's default constructor — should be source + const llvm::Value *XFieldA = + testingLocInIR(LineColFun{7, 7, "_ZN1XC2Ev"}, IR); + ASSERT_TRUE(TConfig.isSource(XFieldA)); + + // GEP for Y::A in Y's default constructor — must NOT be source + const llvm::Value *YFieldA = + testingLocInIR(LineColFun{3, 7, "_ZN1YC2Ev"}, IR); + ASSERT_FALSE(TConfig.isSource(YFieldA)); +} + TEST_F(TaintConfigTest, FunMember_01_Json) { const std::string File = "fun_member_01_cpp_dbg.ll"; const std::string Config = "fun_member_01_config.json";