jbr-ai-labs · DimaOrekhov · Oct 28, 2021 · Nov 4, 2021 · Nov 4, 2021 · Nov 8, 2021
diff --git a/notebooks/dev.ipynb b/notebooks/dev.ipynb
@@ -0,0 +1,296 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "bb381fe8-0977-4f4e-82f1-800f81095cd0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using backend: pytorch\n"
+     ]
+    }
+   ],
+   "source": [
+    "import typing as ty\n",
+    "import torch\n",
+    "import dgl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "3ce89739-301e-40c0-8d3a-56cf8c7c4fd6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "# entities: 14541\n",
+      "# relations: 237\n",
+      "# training edges: 272115\n",
+      "# validation edges: 17535\n",
+      "# testing edges: 20466\n",
+      "Done loading data from cached files.\n"
+     ]
+    }
+   ],
+   "source": [
+    "fb15 = dgl.data.FB15k237Dataset()\n",
+    "graph = fb15[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "6ea9bf48-27fc-44c4-a432-14ea3992e349",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Graph(num_nodes=14541, num_edges=620232,\n",
+       "      ndata_schemes={'ntype': Scheme(shape=(), dtype=torch.int64)}\n",
+       "      edata_schemes={'etype': Scheme(shape=(), dtype=torch.int64), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'test_edge_mask': Scheme(shape=(), dtype=torch.bool), 'valid_edge_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'train_edge_mask': Scheme(shape=(), dtype=torch.bool)})"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "c2f95bc2-f5a9-4546-8224-2d4b9fc040ba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_split(graph: dgl.DGLGraph, split_key: str) -> ty.Tuple[dgl.DGLGraph, torch.Tensor]:\n",
+    "    split_mask = graph.edata[f\"{split_key}_edge_mask\"]\n",
+    "    split_edges_index = torch.nonzero(split_mask, as_tuple=False).squeeze()\n",
+    "\n",
+    "    split_graph = graph.edge_subgraph(split_edges_index, preserve_nodes=True)\n",
+    "    split_graph.edata[\"etype\"] = graph.edata[\"etype\"][split_edges_index]\n",
+    "\n",
+    "    return split_graph, split_edges_index\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "0301b4cb-893b-4da1-9139-801be540eee4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_g, train_edges = get_split(graph, \"train\")\n",
+    "val_g, val_edges = get_split(graph, \"valid\")\n",
+    "test_g, test_edges = get_split(graph, \"test\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "e71631b6-d27f-4267-a6f7-7136fae58e1c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor(544230)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "graph.edata[\"train_mask\"].sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "ab75f30d-872f-4ff5-bbd8-c8d0fc743f20",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor(272115)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "graph.edata[\"train_edge_mask\"].sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "20868465-5182-47bb-b554-bcd5219df7ad",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "14541"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "graph.number_of_src_nodes()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "08e286d3-2e20-4894-be33-a51a9532ae75",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "14541"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "graph.number_of_dst_nodes()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "37e7aa6c-632d-4d8d-b396-53c9390999d1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "14541"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "graph.number_of_nodes()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "81c8024f-25fa-41a5-85e6-0a0892a6e392",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import itertools\n",
+    "from torch.utils.data import Dataset\n",
+    "\n",
+    "\n",
+    "class DirectedDglGraphDataset(Dataset):\n",
+    "\n",
+    "    def __init__(self, graph: dgl.DGLGraph):\n",
+    "        self.graph = graph\n",
+    "        self.number_of_nodes = graph.number_of_nodes()\n",
+    "        self.adjacency_mat = graph.adj()\n",
+    "\n",
+    "    def __getitem__(self, i):\n",
+    "        denominator = self.number_of_nodes - 1\n",
+    "        src_node_index = i // denominator\n",
+    "        dst_node_index = i % denominator\n",
+    "\n",
+    "        if src_node_index <= dst_node_index:\n",
+    "            dst_node_index += 1\n",
+    "\n",
+    "        return {\n",
+    "            \"src_node_index\": src_node_index,\n",
+    "            \"dst_node_index\": dst_node_index,\n",
+    "            \"relation\": self.adjacency_mat[src_node_index, dst_node_index]\n",
+    "        }\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return self.number_of_nodes * (self.number_of_nodes - 1)\n",
+    "\n",
+    "\n",
+    "# Very model specific dataset since it fixates what is a head and what is a tail entites\n",
+    "class UndirectedDglGraphDataset(Dataset):\n",
+    "\n",
+    "    def __init__(self, graph: dgl.DGLGraph):\n",
+    "        self.graph = graph\n",
+    "        self.number_of_nodes = graph.number_of_nodes()\n",
+    "        self.adjacency_mat = graph.adj()\n",
+    "        self.sample = [\n",
+    "            (i, j)\n",
+    "            for i, j in itertools.product(range(self.number_of_nodes), range(self.number_of_nodes))\n",
+    "            if i < j # Take only upper triangle indices\n",
+    "        ]\n",
+    "\n",
+    "    def __getitem__(self, i):\n",
+    "        src_node_index, dst_node_index = self.sample[i]\n",
+    "\n",
+    "        return {\n",
+    "            \"src_node_index\": src_node_index,\n",
+    "            \"dst_node_index\": dst_node_index,\n",
+    "            \"relation\": self.adjacency_mat[src_node_index, dst_node_index]\n",
+    "        }\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return (self.number_of_nodes * (self.number_of_nodes - 1)) // 2\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d7893bed-6853-451f-826e-baf373b98331",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}