Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 106 additions & 45 deletions python/ql/src/Expressions/HashedButNoHash.ql
Original file line number Diff line number Diff line change
Expand Up @@ -12,76 +12,137 @@
*/

import python
private import LegacyPointsTo
import semmle.python.dataflow.new.DataFlow
private import semmle.python.dataflow.new.internal.DataFlowDispatch
private import semmle.python.ApiGraphs

/*
* This assumes that any indexing operation where the value is not a sequence or numpy array involves hashing.
* For sequences, the index must be an int, which are hashable, so we don't need to treat them specially.
* For numpy arrays, the index may be a list, which are not hashable and needs to be treated specially.
/**
* Holds if `cls` explicitly sets `__hash__` to `None`, making instances unhashable.
*/
predicate setsHashToNone(Class cls) {
DuckTyping::getAnAttributeValue(cls, "__hash__") instanceof None
}

/**
* Holds if `cls` has a `@dataclass` decorator with `frozen=True` or `unsafe_hash=True`,
* which generates a `__hash__` method at runtime.
*/
predicate hasDataclassHash(Class cls) {
exists(DataFlow::CallCfgNode decorator |
decorator = API::moduleImport("dataclasses").getMember("dataclass").getACall() and
decorator.asExpr() = cls.getADecorator() and
decorator.getArgByName(["frozen", "unsafe_hash"]).asExpr().(ImmutableLiteral).booleanValue() =
true
)
}

predicate numpy_array_type(ClassValue na) {
exists(ModuleValue np | np.getName() = "numpy" or np.getName() = "numpy.core" |
na.getASuperType() = np.attr("ndarray")
/**
* Holds if `cls` defines `__eq__` directly (not inherited from `object`),
* or has a `@dataclass` decorator that generates `__eq__` (the default).
*/
predicate definesEq(Class cls) {
DuckTyping::hasMethod(cls, "__eq__")
or
// @dataclass(...) call generates __eq__ unless eq=False
exists(DataFlow::CallCfgNode decorator |
decorator = API::moduleImport("dataclasses").getMember("dataclass").getACall() and
decorator.asExpr() = cls.getADecorator() and
not decorator.getArgByName("eq").asExpr().(ImmutableLiteral).booleanValue() = false
)
or
// bare @dataclass without arguments also generates __eq__
cls.getADecorator() =
API::moduleImport("dataclasses").getMember("dataclass").getAValueReachableFromSource().asExpr()
}

predicate has_custom_getitem(Value v) {
v.getClass().lookup("__getitem__") instanceof PythonFunctionValue
/**
* Holds if `cls` is a user-defined class whose instances are unhashable.
*
* In Python, a class is unhashable if:
* - It explicitly sets `__hash__ = None`, or
* - It defines `__eq__` (directly or via decorator like `@dataclass`) without
* defining `__hash__`, and doesn't have a decorator that generates `__hash__`.
*/
predicate isUnhashableUserClass(Class cls) {
setsHashToNone(cls)
or
numpy_array_type(v.getClass())
// In Python 3, defining __eq__ without __hash__ makes a class unhashable.
// This rule does not apply in Python 2.
major_version() = 3 and
DuckTyping::isNewStyle(cls) and
definesEq(cls) and
not DuckTyping::hasMethod(cls, "__hash__") and
not hasDataclassHash(cls) and
not DuckTyping::hasUnresolvedBase(getADirectSuperclass*(cls))
}

predicate explicitly_hashed(ControlFlowNode f) {
exists(CallNode c, GlobalVariable hash |
c.getArg(0) = f and c.getFunction().(NameNode).uses(hash) and hash.getId() = "hash"
/**
* Gets the name of a builtin type whose instances are unhashable.
*/
string getUnhashableBuiltinName() { result = ["list", "set", "dict", "bytearray"] }

/**
* Holds if `origin` is a local source node tracking an unhashable instance that
* flows to `node`, with `clsName` describing the class for the alert.
*/
predicate isUnhashable(DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName) {
exists(Class c |
isUnhashableUserClass(c) and
origin = classInstanceTracker(c) and
origin.flowsTo(node) and
clsName = c.getName()
)
or
clsName = getUnhashableBuiltinName() and
origin = API::builtin(clsName).getAnInstance().asSource() and
origin.flowsTo(node)
}

predicate explicitly_hashed(DataFlow::Node node) {
node = API::builtin("hash").getACall().getArg(0)
}

predicate unhashable_subscript(ControlFlowNode f, ClassValue c, ControlFlowNode origin) {
is_unhashable(f, c, origin) and
exists(SubscriptNode sub | sub.getIndex() = f |
exists(Value custom_getitem |
sub.getObject().(ControlFlowNodeWithPointsTo).pointsTo(custom_getitem) and
not has_custom_getitem(custom_getitem)
)
/**
* Holds if the subscript object in `sub[...]` is known to use hashing for indexing,
* i.e. it does not have a custom `__getitem__` that could accept unhashable indices.
*/
predicate subscriptUsesHashing(Subscript sub) {
DataFlow::exprNode(sub.getObject()) =
API::builtin("dict").getAnInstance().getAValueReachableFromSource()
or
exists(Class cls |
classInstanceTracker(cls)
.(DataFlow::LocalSourceNode)
.flowsTo(DataFlow::exprNode(sub.getObject())) and
not DuckTyping::hasMethod(cls, "__getitem__")
)
}

predicate is_unhashable(ControlFlowNodeWithPointsTo f, ClassValue cls, ControlFlowNode origin) {
exists(Value v | f.pointsTo(v, origin) and v.getClass() = cls |
not cls.hasAttribute("__hash__") and not cls.failedInference(_) and cls.isNewStyle()
or
cls.lookup("__hash__") = Value::named("None")
predicate unhashable_subscript(DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName) {
exists(Subscript sub |
node = DataFlow::exprNode(sub.getIndex()) and
subscriptUsesHashing(sub)
|
isUnhashable(origin, node, clsName)
)
}

/**
* Holds if `f` is inside a `try` that catches `TypeError`. For example:
*
* try:
* ... f ...
* except TypeError:
* ...
*
* This predicate is used to eliminate false positive results. If `hash`
* is called on an unhashable object then a `TypeError` will be thrown.
* But this is not a bug if the code catches the `TypeError` and handles
* it.
* Holds if `e` is inside a `try` that catches `TypeError`.
*/
predicate typeerror_is_caught(ControlFlowNode f) {
predicate typeerror_is_caught(Expr e) {
exists(Try try |
try.getBody().contains(f.getNode()) and
try.getAHandler().getType().(ExprWithPointsTo).pointsTo(ClassValue::typeError())
try.getBody().contains(e) and
try.getAHandler().getType() = API::builtin("TypeError").getAValueReachableFromSource().asExpr()
)
}

from ControlFlowNode f, ClassValue c, ControlFlowNode origin
from DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName
where
not typeerror_is_caught(f) and
not typeerror_is_caught(node.asExpr()) and
(
explicitly_hashed(f) and is_unhashable(f, c, origin)
explicitly_hashed(node) and isUnhashable(origin, node, clsName)
or
unhashable_subscript(f, c, origin)
unhashable_subscript(origin, node, clsName)
)
select f.getNode(), "This $@ of $@ is unhashable.", origin, "instance", c, c.getQualifiedName()
select node, "This $@ of $@ is unhashable.", origin, "instance", origin, clsName
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
| test.py:17:17:17:19 | ControlFlowNode for obj | This $@ of $@ is unhashable. | test.py:16:11:16:23 | ControlFlowNode for HasEqNoHash() | instance | test.py:16:11:16:23 | ControlFlowNode for HasEqNoHash() | HasEqNoHash |
| test.py:48:17:48:17 | ControlFlowNode for p | This $@ of $@ is unhashable. | test.py:47:9:47:27 | ControlFlowNode for MutableDataclass() | instance | test.py:47:9:47:27 | ControlFlowNode for MutableDataclass() | MutableDataclass |
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Expressions/HashedButNoHash.ql

Check warning

Code scanning / CodeQL

Query test without inline test expectations Warning test

Query test does not use inline test expectations.
48 changes: 48 additions & 0 deletions python/ql/test/3/query-tests/Expressions/general/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Classes without __eq__ are hashable (inherit __hash__ from object)
class NoEqClass:
def __init__(self, x):
self.x = x

def hash_no_eq():
obj = NoEqClass(1)
return hash(obj) # OK: no __eq__, so __hash__ is inherited from object

# Classes with __eq__ but no __hash__ are unhashable
class HasEqNoHash:
def __eq__(self, other):
return self.x == other.x

def hash_eq_no_hash():
obj = HasEqNoHash()
return hash(obj) # should be flagged

# @dataclass(frozen=True) generates __hash__, so instances are hashable
from dataclasses import dataclass

@dataclass(frozen=True)
class FrozenPoint:
x: int
y: int

def hash_frozen_dataclass():
p = FrozenPoint(1, 2)
return hash(p) # OK: frozen dataclass has __hash__

# @dataclass(unsafe_hash=True) also generates __hash__
@dataclass(unsafe_hash=True)
class UnsafeHashPoint:
x: int
y: int

def hash_unsafe_hash_dataclass():
p = UnsafeHashPoint(1, 2)
return hash(p) # OK: unsafe_hash=True generates __hash__

# Plain @dataclass without frozen/unsafe_hash and with __eq__ is unhashable
@dataclass
class MutableDataclass:
x: int

def hash_mutable_dataclass():
p = MutableDataclass(1)
return hash(p) # should be flagged: @dataclass generates __eq__ but not __hash__
Original file line number Diff line number Diff line change
@@ -1 +1 @@
| expressions_test.py:42:20:42:25 | unhash | This $@ of $@ is unhashable. | expressions_test.py:41:32:41:37 | ControlFlowNode for list() | instance | file://:0:0:0:0 | builtin-class list | list |
| expressions_test.py:42:20:42:25 | ControlFlowNode for unhash | This $@ of $@ is unhashable. | expressions_test.py:41:32:41:37 | ControlFlowNode for list() | instance | expressions_test.py:41:32:41:37 | ControlFlowNode for list() | list |
Original file line number Diff line number Diff line change
Expand Up @@ -279,3 +279,12 @@ def local():
def apply(f):
pass
apply(foo)([1])

# NoEqClass is hashable (no __eq__ => inherits __hash__ from object).
class NoEqClass:
def __init__(self, x):
self.x = x

def hash_no_eq():
obj = NoEqClass(1)
return hash(obj) # OK: no __eq__, so __hash__ is inherited from object
Loading