Merge pull request #28 from MITLibraries/tco-157

matt-bernhardt · web-flow · commit e728569bea6c · 2025-06-25T10:15:26.000-04:00
Implement prediction logic via neural network algorithm
diff --git a/Makefile b/Makefile
@@ -1,5 +1,7 @@
 SHELL=/bin/bash
 DATETIME:=$(shell date -u +%Y%m%dT%H%M%SZ)
+PAYLOAD:=tests/sam/citation.json
+
 ### This is the Terraform-generated header for tacos-detectors-lambdas-dev. If  ###
 ###   this is a Lambda repo, uncomment the FUNCTION line below  ###
 ###   and review the other commented lines in the document.     ###
@@ -77,33 +79,13 @@ sam-http-run: # Run lambda locally as an HTTP server
 
 sam-http-ping: # Send curl command to SAM HTTP server using the ping action
 	curl --location 'http://localhost:3000/foo' \
-	--header 'Content-Type: application\json' \
+	--header 'Content-Type: application/json' \
 	--data '{"action":"ping", "challenge_secret": "secret_phrase"}'
 
 sam-http-predict: # Send curl command to SAM HTTP server using the predict action (next step - take file argument?)
 	curl --location 'http://localhost:3000/foo' \
-	--header 'Content-Type: application\json' \
-	--data '{ \
-		"action": "predict", \
-		"challenge_secret": "secret_phrase", \
-		"features": { \
-			"apa": 0, \
-			"brackets": 0, \
-			"colons": 0, \
-			"commas": 0, \
-			"lastnames": 0, \
-			"no": 0, \
-			"pages": 0, \
-			"periods": 0, \
-			"pp": 0, \
-			"quotes": 0, \
-			"semicolons": 0, \
-			"vol": 0, \
-			"words": 0, \
-			"year":0 \
-		} \
-		}'
-
+	--header 'Content-Type: application/json' \
+	--data '@$(PAYLOAD)'
 
 ### Terraform-generated Developer Deploy Commands for Dev environment ###
 dist-dev: ## Build docker container (intended for developer-based manual build)
diff --git a/Pipfile b/Pipfile
@@ -6,6 +6,9 @@ name = "pypi"
 [packages]
 sentry-sdk = "*"
 jsonschema = "*"
+pandas = "*"
+scikit-learn = "*"
+pandas-stubs = "*"
 
 [dev-packages]
 black = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/README.md b/README.md
@@ -86,24 +86,51 @@ the lambda does not utilize them in request payload.
 
 3. In another terminal, perform an HTTP request via another `Makefile` command:
 
+The server's baseline readiness can be confirmed via the ping action:
 ```shell
 make sam-http-ping
 ```
 
-Response should have an HTTP status of `200` and respond with:
+The response should have an HTTP status of `200` and respond with:
 
 ```json
 {
   "response": "pong"
 }
 ```
 
+Prediction requests can be sent via the predict action:
+
+```shell
+make sam-http-predict
+```
+
+```json
+{
+  "response": true
+}
+```
+
+Custom payloads can be found in the `tests/sam` directory, and the default payload can be overridden via the `PAYLOAD`
+Makefile variable:
+
+```shell
+make sam-http-predict PAYLOAD=tests/sam/noncitation.json
+```
+
+```json
+{
+  "response": false
+}
+```
+
 ### Invoking lambda directly
 
 While lambdas can be invoked via HTTP methods (ALB, Function URL, etc), they are also often invoked directly with an
 `event` payload. To do so with SAM, you do **not** need to first start an HTTP server with `make sam-run`, you can
 invoke the function image directly:
 
+#### Example 1: ping
 ```shell
 echo '{"action": "ping", "challenge_secret": "secret_phrase"}' | sam local invoke --env-vars tests/sam/env.json -e -
 ```
@@ -115,7 +142,23 @@ Response:
 false, "body": "{\"response\": \"pong\"}"}
 ```
 
-As you can see from this response, the lambda is still returning a dictionary that _would_ work for an HTTP response,
+#### Example 2: predict
+
+The JSON files with example payloads in `tests/sam` can be helpful for working with the `predict` action, rather than
+trying to include all features and values directly within an echo command:
+
+```shell
+echo "$(cat tests/sam/citation.json)" | sam local invoke --env-vars tests/sam/env.json -e -
+```
+
+Response:
+
+```text
+{"statusCode": 200, "statusDescription": "200 OK", "headers": {"Content-Type": "application/json"}, "isBase64Encoded":
+false, "body": "{\"response\": true}"}
+```
+
+As you can see from these responses, the lambda is still returning a dictionary that _would_ work for an HTTP response,
 but is actually just a dictionary with the required information.
 
 It's unknown at this time if this lambda will get invoked via non-HTTP methods, but SAM will be helpful for testing and
diff --git a/lambdas/models/neural.pkl b/lambdas/models/neural.pkl
diff --git a/lambdas/predict.py b/lambdas/predict.py
@@ -3,7 +3,9 @@
 from abc import ABC, abstractmethod
 from dataclasses import asdict, dataclass
 from http import HTTPStatus
+from pickle import load
 
+import pandas as pd
 from jsonschema import ValidationError, validate
 
 from lambdas.config import Config, configure_sentry
@@ -41,13 +43,28 @@ def handle(self, _payload: InputPayload) -> dict:
 class PredictHandler(RequestHandler):
     """Handle prediction requests."""
 
+    def load_model(self) -> None:
+        """Load the pickled machine learning model from disk.
+
+        Please note that this method does not have a return value. It populates
+        the `self.model` attribute with the loaded model.
+        """
+        path = "lambdas/models/neural.pkl"
+        with open(path, "rb") as f:
+            self.model = load(f)  # noqa: S301
+
     def handle(self, payload: InputPayload) -> dict:
-        # validate payload against a JSONSchema
+        """Validate received payload, load model, and generate prediction."""
         with open("lambdas/schemas/features_schema.json") as f:
             schema = json.load(f)
-        logger.debug(payload.to_dict())
         validate(instance=payload.to_dict(), schema=schema)
-        return {"response": "true"}
+
+        self.load_model()
+
+        data = pd.DataFrame(payload.features, index=[0])
+        prediction = self.model.predict(data)
+
+        return {"response": bool(prediction[0])}
 
 
 class LambdaProcessor:
diff --git a/pyproject.toml b/pyproject.toml
@@ -10,6 +10,10 @@ disallow_untyped_calls = true
 disallow_untyped_defs = true
 exclude = ["tests/"]
 
+[[tool.mypy.overrides]]
+module = ["sklearn.*"]
+ignore_missing_imports = true
+
 [tool.pytest.ini_options]
 log_level = "INFO"
 
@@ -41,7 +45,6 @@ ignore = [
     "PLR0912",
     "PLR0913",
     "PLR0915",
-    "S320",
     "S321",
 ]
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -19,8 +19,38 @@ def valid_ping_event():
 
 
 @pytest.fixture
-def valid_predict_event():
-    """Valid event payload for an HTTP invocation."""
+def valid_predict_event_citation():
+    """Valid event payload with features extracted from a known citation."""
+    return {
+        "body": json.dumps(
+            {
+                "action": "predict",
+                "challenge_secret": "secret_phrase",
+                "features": {
+                    "apa": 0,
+                    "brackets": 0,
+                    "colons": 0,
+                    "commas": 5,
+                    "lastnames": 4,
+                    "no": 0,
+                    "pages": 0,
+                    "periods": 7,
+                    "pp": 0,
+                    "quotes": 0,
+                    "semicolons": 1,
+                    "vol": 0,
+                    "words": 12,
+                    "year": 0,
+                },
+            }
+        ),
+        "requestContext": {"http": {"method": "POST"}},
+    }
+
+
+@pytest.fixture
+def valid_predict_event_noncitation():
+    """Valid event payload with features extracted from a non-citation."""
     return {
         "body": json.dumps(
             {
@@ -39,7 +69,7 @@ def valid_predict_event():
                     "quotes": 0,
                     "semicolons": 0,
                     "vol": 0,
-                    "words": 0,
+                    "words": 1,
                     "year": 0,
                 },
             }
diff --git a/tests/sam/citation.json b/tests/sam/citation.json
@@ -0,0 +1,20 @@
+{
+	"action": "predict",
+	"challenge_secret": "secret_phrase",
+	"features": {
+		"apa": 0,
+		"brackets": 0,
+		"colons": 0,
+		"commas": 5,
+		"lastnames": 4,
+		"no": 0,
+		"pages": 0,
+		"periods": 7,
+		"pp": 0,
+		"quotes": 0,
+		"semicolons": 1,
+		"vol": 0,
+		"words": 12,
+		"year": 0
+	}
+}
diff --git a/tests/sam/noncitation.json b/tests/sam/noncitation.json
@@ -0,0 +1,20 @@
+{
+	"action": "predict",
+	"challenge_secret": "secret_phrase",
+	"features": {
+		"apa": 0,
+		"brackets": 0,
+		"colons": 0,
+		"commas": 0,
+		"lastnames": 0,
+		"no": 0,
+		"pages": 0,
+		"periods": 0,
+		"pp": 0,
+		"quotes": 0,
+		"semicolons": 0,
+		"vol": 0,
+		"words": 1,
+		"year": 0
+	}
+}
diff --git a/tests/test_predict.py b/tests/test_predict.py
@@ -68,12 +68,20 @@ def test_lambda_handler_ping_valid(valid_ping_event):
 
 
 # Prediction action
-def test_lambda_handler_predict_valid(valid_predict_event):
-    """Test lambda_handler with a valid HTTP event."""
-    response = predict.lambda_handler(valid_predict_event, {})
+def test_lambda_handler_predict_citation(valid_predict_event_citation):
+    """Test lambda_handler with a valid HTTP event for a citation."""
+    response = predict.lambda_handler(valid_predict_event_citation, {})
+    assert response["statusCode"] == HTTPStatus.OK
+    body = json.loads(response["body"])
+    assert body["response"]
+
+
+def test_lambda_handler_predict_noncitation(valid_predict_event_noncitation):
+    """Test lambda_handler with a valid HTTP event for a non-citation."""
+    response = predict.lambda_handler(valid_predict_event_noncitation, {})
     assert response["statusCode"] == HTTPStatus.OK
     body = json.loads(response["body"])
-    assert body["response"] == "true"
+    assert not body["response"]
 
 
 def test_lambda_handler_predict_invalid_missing(invalid_predict_event_missing):
@@ -85,8 +93,17 @@ def test_lambda_handler_predict_invalid_missing(invalid_predict_event_missing):
 
 
 def test_lambda_handler_predict_invalid_extra(invalid_predict_event_extra):
-    """Test lambda_handler with less than a full set of prediction features."""
+    """Test lambda_handler with extraneous prediction features."""
     response = predict.lambda_handler(invalid_predict_event_extra, {})
     assert response["statusCode"] == HTTPStatus.BAD_REQUEST
     body = json.loads(response["body"])
     assert body["error"][:37] == "Additional properties are not allowed"
+
+
+def test_predict_handler_is_using_a_fitted_model():
+    """Test that the model loads correctly, which means it is fitted."""
+    predictor = predict.PredictHandler()
+    assert not hasattr(predictor, "model")
+    predictor.load_model()
+    assert hasattr(predictor, "model")
+    assert callable(predictor.model.predict)