From f8a3d95727e38ecf2a8a8e7b882f82c01409825e Mon Sep 17 00:00:00 2001 From: Alkid Date: Sun, 3 Dec 2023 22:16:49 +0100 Subject: [PATCH 01/31] updated references --- examples/advanced-usage-NCES.ipynb | 2 +- examples/celoe_notebook.ipynb | 2 +- examples/custom-LPs-NCES.ipynb | 2 +- examples/evolearner_notebook.ipynb | 2 +- examples/ocel_notebook.ipynb | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/advanced-usage-NCES.ipynb b/examples/advanced-usage-NCES.ipynb index 59caadec..673e22c7 100644 --- a/examples/advanced-usage-NCES.ipynb +++ b/examples/advanced-usage-NCES.ipynb @@ -6,7 +6,7 @@ "id": "immune-fluid", "metadata": {}, "source": [ - "From the main directory \"Ontolearn\", run the commands for NCES data in [`./download_external_resources.sh`](../download_external_resources.sh) to download pretrained models and datasets." + "From the main directory \"Ontolearn\", run the commands for NCES data mentioned [here](https://ontolearn-docs-dice-group.netlify.app/usage/02_installation#download-external-files) to download pretrained models and datasets." ] }, { diff --git a/examples/celoe_notebook.ipynb b/examples/celoe_notebook.ipynb index fdc8216f..9184bdff 100644 --- a/examples/celoe_notebook.ipynb +++ b/examples/celoe_notebook.ipynb @@ -8,7 +8,7 @@ "source": [ "# CELOE Notebook\n", "This is a jupyter notebook file to execute [CELOE](ontolearn.concept_learner.CELOE) and generate predictive results. We recommend you to see the [concept learners](../docs/usage/06_concept_learners.md) guide before continuing with the execution.\n", - "Also if you have not done it already, from the main directory \"Ontolearn\", run the commands for Datasets in [`./download_external_resources.sh`](../download_external_resources.sh) to download the datasets." + "Also if you have not done it already, from the main directory \"Ontolearn\", run the commands for Datasets mentioned [here](https://ontolearn-docs-dice-group.netlify.app/usage/02_installation#download-external-files) to download the datasets." ] }, { diff --git a/examples/custom-LPs-NCES.ipynb b/examples/custom-LPs-NCES.ipynb index d7465a7c..5264a86c 100644 --- a/examples/custom-LPs-NCES.ipynb +++ b/examples/custom-LPs-NCES.ipynb @@ -6,7 +6,7 @@ "id": "supposed-stone", "metadata": {}, "source": [ - "From the main directory \"Ontolearn\", run the commands for NCES data in [`./download_external_resources.sh`](../download_external_resources.sh) to download pretrained models and datasets." + "From the main directory \"Ontolearn\", run the commands for NCES data mentioned [here](https://ontolearn-docs-dice-group.netlify.app/usage/02_installation#download-external-files) to download pretrained models and datasets." ] }, { diff --git a/examples/evolearner_notebook.ipynb b/examples/evolearner_notebook.ipynb index 18725bf0..268dfc01 100644 --- a/examples/evolearner_notebook.ipynb +++ b/examples/evolearner_notebook.ipynb @@ -7,7 +7,7 @@ "metadata": {}, "source": [ "# EvoLearner Notebook\n", - "This is a jupyter notebook file to execute [EvoLearner](ontolearn.concept_learner.EvoLearner) and generate predictive results. We recommend you to see the [concept learners](../docs/usage/06_concept_learners.md) guide before continuing with the execution. Also if you have not done it already, from the main directory \"Ontolearn\", run the commands for Datasets in [`./download_external_resources.sh`](../download_external_resources.sh) to download the datasets." 
+ "This is a jupyter notebook file to execute [EvoLearner](ontolearn.concept_learner.EvoLearner) and generate predictive results. We recommend you to see the [concept learners](../docs/usage/06_concept_learners.md) guide before continuing with the execution. Also if you have not done it already, from the main directory \"Ontolearn\", run the commands for Datasets mentioned [here](https://ontolearn-docs-dice-group.netlify.app/usage/02_installation#download-external-files) to download the datasets." ] }, { diff --git a/examples/ocel_notebook.ipynb b/examples/ocel_notebook.ipynb index 17d6c0cd..0fa911d8 100644 --- a/examples/ocel_notebook.ipynb +++ b/examples/ocel_notebook.ipynb @@ -7,7 +7,7 @@ "metadata": {}, "source": [ "# OCEL Notebook\n", - "This is a jupyter notebook file to execute [OCEL](ontolearn.concept_learner.OCEL) and generate predictive results. We recommend you to see the [concept learners](../docs/usage/06_concept_learners.md) guide before continuing with the execution. Also if you have not done it already, from the main directory \"Ontolearn\", run the commands for Datasets in [`./download_external_resources.sh`](../download_external_resources.sh) to download the datasets." + "This is a jupyter notebook file to execute [OCEL](ontolearn.concept_learner.OCEL) and generate predictive results. We recommend you to see the [concept learners](../docs/usage/06_concept_learners.md) guide before continuing with the execution. Also if you have not done it already, from the main directory \"Ontolearn\", run the commands for Datasets mentioned [here](https://ontolearn-docs-dice-group.netlify.app/usage/02_installation#download-external-files) to download the datasets." ] }, { From 53588af43ca88ed203f23925a4508011eee4c94e Mon Sep 17 00:00:00 2001 From: Alkid Date: Sun, 3 Dec 2023 22:17:59 +0100 Subject: [PATCH 02/31] moved compute_quality --- deploy_cl.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/deploy_cl.py b/deploy_cl.py index afc67138..dbaf8ee0 100644 --- a/deploy_cl.py +++ b/deploy_cl.py @@ -3,7 +3,7 @@ from argparse import ArgumentParser import random import os - +from ontolearn.model_adapter import compute_quality from ontolearn.ea_algorithms import EASimple from ontolearn.ea_initialization import EARandomWalkInitialization, RandomInitMethod, EARandomInitialization from ontolearn.fitness_functions import LinearPressureFitness @@ -32,18 +32,6 @@ renderer = DLSyntaxObjectRenderer() -def compute_quality(KB, solution, pos, neg, qulaity_func="F1"): - func = metrics[qulaity_func]().score2 - instances = set(KB.individuals(solution)) - if isinstance(list(pos)[0], str): - instances = {ind.get_iri().as_str().split("/")[-1] for ind in instances} - tp = len(pos.intersection(instances)) - fn = len(pos.difference(instances)) - fp = len(neg.intersection(instances)) - tn = len(neg.difference(instances)) - return func(tp=tp, fn=fn, fp=fp, tn=tn)[-1] - - def setup_prerequisites(individuals, pos_ex, neg_ex, random_ex: bool, size_of_ex): # start_time = time.time() @@ -661,9 +649,11 @@ def run(args): if not os.path.exists("NCESData/") and args.model == "nces": print("\nWarning! 
You are trying to deploy NCES without the NCES data!") - print(f"Please download the necessary files first: see ./download_external_resources.sh\n") + print(f"Please download the necessary files first: see " + f"https://ontolearn-docs-dice-group.netlify.app/usage/02_installation#download-external-files\n") elif not os.path.exists("KGs") and "KGs/" in args.path_knowledge_base: print("\nWarning! There is no 'KGs' folder!") - print(f"Please download the datasets first: see ./download_external_resources.sh\n") + print(f"Please download the datasets first: " + f"see https://ontolearn-docs-dice-group.netlify.app/usage/02_installation#download-external-files\n") else: run(args) From d69e1e3aa07f804c8dad4f2713fcccebdd4c5f6b Mon Sep 17 00:00:00 2001 From: Alkid Date: Sun, 3 Dec 2023 22:18:41 +0100 Subject: [PATCH 03/31] added NCESData to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8ce1c646..c0ff3c7b 100644 --- a/.gitignore +++ b/.gitignore @@ -157,3 +157,4 @@ embeddings.zip KGs.zip /Fuseki/ /KGs/ +/NCESData/ From f9de169c9a479c30fc6ac26277ce1ddde4b864a7 Mon Sep 17 00:00:00 2001 From: Alkid Date: Sun, 3 Dec 2023 22:20:23 +0100 Subject: [PATCH 04/31] Added entry point for ontolearn --- examples/uncle_lp2.json | 81 ++++++++++++++++++++++++ main.py | 109 +++++++++++++++++++++++++++++++++ ontolearn/model_adapter.py | 122 ++++++++++++++++++++++++++++++++++--- setup.py | 1 + 4 files changed, 304 insertions(+), 9 deletions(-) create mode 100644 examples/uncle_lp2.json create mode 100644 main.py diff --git a/examples/uncle_lp2.json b/examples/uncle_lp2.json new file mode 100644 index 00000000..7355e6cd --- /dev/null +++ b/examples/uncle_lp2.json @@ -0,0 +1,81 @@ +{ + "positive_examples": ["http://www.benchmark.org/family#F2M13" +,"http://www.benchmark.org/family#F2M11" +,"http://www.benchmark.org/family#F2M20" +,"http://www.benchmark.org/family#F2M27" +,"http://www.benchmark.org/family#F2M32" +,"http://www.benchmark.org/family#F2M29" +,"http://www.benchmark.org/family#F2M37" +,"http://www.benchmark.org/family#F3M44" +,"http://www.benchmark.org/family#F5M63" +,"http://www.benchmark.org/family#F6M71" +,"http://www.benchmark.org/family#F6M80" +,"http://www.benchmark.org/family#F6M78" +,"http://www.benchmark.org/family#F6M90" +,"http://www.benchmark.org/family#F6M85" +,"http://www.benchmark.org/family#F6M100" +,"http://www.benchmark.org/family#F6M92" +,"http://www.benchmark.org/family#F7M113" +,"http://www.benchmark.org/family#F7M107" +,"http://www.benchmark.org/family#F7M115" +,"http://www.benchmark.org/family#F7M120" +,"http://www.benchmark.org/family#F7M125" +,"http://www.benchmark.org/family#F7M131" +,"http://www.benchmark.org/family#F7M122" +,"http://www.benchmark.org/family#F9M149" +,"http://www.benchmark.org/family#F9M144" +,"http://www.benchmark.org/family#F9M151" +,"http://www.benchmark.org/family#F9M153" +,"http://www.benchmark.org/family#F9M142" +,"http://www.benchmark.org/family#F9M159" +,"http://www.benchmark.org/family#F9M162" +,"http://www.benchmark.org/family#F9M157" +,"http://www.benchmark.org/family#F9M167" +,"http://www.benchmark.org/family#F10M173" +,"http://www.benchmark.org/family#F10M180" +,"http://www.benchmark.org/family#F10M182" +,"http://www.benchmark.org/family#F10M194" +,"http://www.benchmark.org/family#F10M187" +,"http://www.benchmark.org/family#F10M196" +], + "negative_examples": [ +"http://www.benchmark.org/family#F10F198" +,"http://www.benchmark.org/family#F7F108" +,"http://www.benchmark.org/family#F9M165" 
+,"http://www.benchmark.org/family#F6F82" +,"http://www.benchmark.org/family#F9F148" +,"http://www.benchmark.org/family#F3M43" +,"http://www.benchmark.org/family#F7F103" +,"http://www.benchmark.org/family#F10M188" +,"http://www.benchmark.org/family#F1F3" +,"http://www.benchmark.org/family#F9F156" +,"http://www.benchmark.org/family#F9M147" +,"http://www.benchmark.org/family#F10F191" +,"http://www.benchmark.org/family#F9F160" +,"http://www.benchmark.org/family#F6M95" +,"http://www.benchmark.org/family#F2F14" +,"http://www.benchmark.org/family#F6F94" +,"http://www.benchmark.org/family#F1F2" +,"http://www.benchmark.org/family#F6F86" +,"http://www.benchmark.org/family#F10F174" +,"http://www.benchmark.org/family#F2F12" +,"http://www.benchmark.org/family#F2F28" +,"http://www.benchmark.org/family#F5M60" +,"http://www.benchmark.org/family#F8M134" +,"http://www.benchmark.org/family#F7M117" +,"http://www.benchmark.org/family#F10F189" +,"http://www.benchmark.org/family#F4F55" +,"http://www.benchmark.org/family#F6F76" +,"http://www.benchmark.org/family#F7F119" +,"http://www.benchmark.org/family#F2F36" +,"http://www.benchmark.org/family#F2M9" +,"http://www.benchmark.org/family#F2F38" +,"http://www.benchmark.org/family#F2F22" +,"http://www.benchmark.org/family#F6F89" +,"http://www.benchmark.org/family#F5M64" +,"http://www.benchmark.org/family#F5F67" +,"http://www.benchmark.org/family#F3F53" +,"http://www.benchmark.org/family#F2F26" +,"http://www.benchmark.org/family#F5F65" +] + } diff --git a/main.py b/main.py new file mode 100644 index 00000000..769f6007 --- /dev/null +++ b/main.py @@ -0,0 +1,109 @@ +from ontolearn.model_adapter import execute +from argparse import ArgumentParser + + +def get_default_arguments(description=None): + parser = ArgumentParser() + + parser.add_argument("--model", type=str, default="nces", choices=["celoe", "ocel", "evolearner", "nces"], + help="Available concept learning models.") + + # Knowledge graph related arguments + parser.add_argument("--knowledge_base_path", type=str, default="KGs/Family/family-benchmark_rich_background.owl", + help="Path to the knowledge base/ontology. This file contains '.owl' extension," + "e.g. 'some/path/kb.owl'") + parser.add_argument("--sparql_endpoint", type=str, default=None, + help="An endpoint of a triple store, e.g. 'http://localhost:3030/family/sparql'. ") + parser.add_argument("--path_of_embeddings", type=str, + default='NCESData/family/embeddings/ConEx_entity_embeddings.csv', + help="Path to knowledge base embeddings. Some models like NCES require this," + "e.g. 'some/path/kb_embeddings.csv'") + # Common model arguments + parser.add_argument("--path_learning_problem", type=str, default='examples/uncle_lp2.json', + help="Path to a .json file that contains 2 properties 'positive_examples' and " + "'negative_examples'. Each of this properties should contain the IRIs of the respective" + "instances. e.g. 
'some/path/lp.json'") + parser.add_argument("--quality_metric", type=str, default='f1', + choices=["f1", "accuracy", "recall", "precision", "weighted_accuracy"], + help="Quality metric.") + parser.add_argument("--max_runtime", type=int, default=5, help="Maximum runtime.") + + # CELOE, OCEL and Evolearner only + + parser.add_argument('--terminate_on_goal', type=bool, default=True, help="Terminate when finding concept of quality" + "1.0?") + parser.add_argument("--use_card_restrictions", type=bool, default=True, + help="Use cardinality restrictions for object properties?") + parser.add_argument("--use_inverse", type=bool, default=True, help="Use inverse.") + parser.add_argument("--card_limit", type=int, default=10, help="Cardinality limit for object properties.") + parser.add_argument("--max_nr_splits", type=int, default=12, help="Maximum number of splits.") + + # CELOE and OCEL only + parser.add_argument("--max_results", type=int, default=10, help="Maximum results to find (not to show)") + parser.add_argument("--iter_bound", type=int, default=10_000, help="Iterations bound.") + parser.add_argument("--max_num_of_concepts_tested", type=int, default=10_000, + help="Maximum number of concepts tested.") + parser.add_argument("--best_only", type=bool, default=True, help="Best results only?") + parser.add_argument("--calculate_min_max", type=bool, default=True, help="Only for statistical purpose.") + parser.add_argument("--gain_bonus_factor", type=float, default=0.3, + help="Factor that weighs the increase in quality compared to the parent node.") + parser.add_argument("--expansion_penalty_factor", type=float, default=0.1, + help="The value that is subtracted from the heuristic for each horizontal expansion of this") + parser.add_argument("--max_child_length", type=int, default=10, help="Maximum child length") + parser.add_argument("--use_negation", type=bool, default=True, help="Use negation?") + parser.add_argument("--use_all_constructor", type=bool, default=True, help="Use all constructors?") + parser.add_argument("--use_numeric_datatypes", type=bool, default=True, help="Use numeric data types?") + parser.add_argument("--use_time_datatypes", type=bool, default=True, help="Use time datatypes?") + parser.add_argument("--use_boolean_datatype", type=bool, default=True, help="Use boolean datatypes?") + + # CELOE only + parser.add_argument("--start_node_bonus", type=float, default=0.1, help="Special value added to the root node.") + parser.add_argument("--node_refinement_penalty", type=float, default=0.001, help="Node refinement penalty.") + + # EvoLearner Only + parser.add_argument("--use_data_properties", type=bool, default=True, help="Use data properties?") + parser.add_argument("--tournament_size", type=int, default=7, help="Tournament size.") + parser.add_argument("--population_size", type=int, default=800, help="Population size.") + parser.add_argument("--num_generations", type=int, default=200, help="Number of generations.") + parser.add_argument("--height_limit", type=int, default=17, help="Height limit.") + parser.add_argument("--gain", type=int, default=2048, help="Gain.") + parser.add_argument("--penalty", type=int, default=1, help="Penalty.") + parser.add_argument("--max_t", type=int, default=2, help="Number of paths.") + parser.add_argument("--jump_pr", type=float, default=0.5, help="Probability to explore paths of length 2.") + parser.add_argument("--crossover_pr", type=float, default=0.9, help="Crossover probability.") + parser.add_argument("--mutation_pr", type=float, default=0.1, 
help="Mutation probability") + parser.add_argument("--elitism", type=bool, default=False, help="Elitism.") + parser.add_argument("--elite_size", type=float, default=0.1, help="Elite size") + parser.add_argument("--min_height", type=int, default=1, help="Minimum height of trees") + parser.add_argument("--max_height", type=int, default=3, help="Maximum height of trees") + parser.add_argument("--init_method_type", type=str, default="RAMPED_HALF_HALF", + help="Random initialization method.", choices=["GROW", "FULL", "RAMPED_HALF_HALF"]) + + # NCES only + parser.add_argument("--learner_name", type=str, default="SetTransformer", help="Learner name.", + choices=["SetTransformer", "GRU", "LSTM"]) + parser.add_argument("--proj_dim", type=int, default=128, help="Number of projection dimensions.") + parser.add_argument("--rnn_n_layers", type=int, default=2, help="Number of RNN layers (only for LSTM and GRU).") + parser.add_argument("--drop_prob", type=float, default=0.1, help="Drop probability.") + parser.add_argument("--num_heads", type=int, default=4, help="Number of heads") + parser.add_argument("--num_seeds", type=int, default=1, help="Number of seeds (only for SetTransformer).") + parser.add_argument("--num_inds", type=int, default=32, help="Number of inducing points (only for SetTransformer).") + parser.add_argument("--ln", type=bool, default=False, help="Layer normalization (only for SetTransformer).") + parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate.") + parser.add_argument("--decay_rate", type=int, default=0, help="Decay rate.") + parser.add_argument("--clip_value", type=int, default=5, help="Clip value.") + parser.add_argument("--batch_size", type=int, default=256, help="Batch size") + parser.add_argument("--num_workers", type=int, default=8, help="Number of workers") + parser.add_argument("--max_length", type=int, default=48, help="Maximum length") + parser.add_argument("--load_pretrained", type=bool, default=True, help="Load pretrained.") + parser.add_argument("--sorted_examples", type=bool, default=True, help="Sorted examples.") + parser.add_argument("--pretrained_model_name", type=str, default="SetTransformer", help="Pretrained model name", + choices=["SetTransformer", "GRU", "LSTM"]) + + if description is None: + return parser.parse_args() + return parser.parse_args(description) + + +if __name__ == '__main__': + execute(get_default_arguments()) diff --git a/ontolearn/model_adapter.py b/ontolearn/model_adapter.py index e6631aef..1b526867 100644 --- a/ontolearn/model_adapter.py +++ b/ontolearn/model_adapter.py @@ -1,17 +1,65 @@ """Model adapters.""" import inspect +import json import logging +import re from typing import TypeVar, List, Optional, Union - +from owlapy.render import DLSyntaxObjectRenderer from ontolearn.abstracts import AbstractHeuristic, AbstractScorer, BaseRefinement, AbstractKnowledgeBase, \ AbstractNode from ontolearn.base_concept_learner import BaseConceptLearner -from owlapy.model import OWLReasoner, OWLNamedIndividual, OWLClassExpression, OWLAxiom +from owlapy.model import OWLReasoner, OWLNamedIndividual, OWLClassExpression, OWLAxiom, IRI from ontolearn.base import OWLReasoner_Owlready2_ComplexCEInstances +from ontolearn.concept_learner import CELOE, OCEL, EvoLearner, NCES, Drill +from ontolearn.ea_algorithms import EASimple +from ontolearn.ea_initialization import EARandomWalkInitialization, EARandomInitialization, RandomInitMethod +from ontolearn.fitness_functions import LinearPressureFitness +from ontolearn.heuristics import 
CELOEHeuristic, OCELHeuristic +from ontolearn.knowledge_base import KnowledgeBase +from ontolearn.learning_problem import PosNegLPStandard +from ontolearn.refinement_operators import ModifiedCELOERefinement +from ontolearn.metrics import Accuracy, F1, Recall, Precision, WeightedAccuracy +from ontolearn.value_splitter import BinningValueSplitter, EntropyValueSplitter logger = logging.getLogger(__name__) -# TODO:CD: Move all imports to the top of the file +metrics = {'f1': F1, + 'accuracy': Accuracy, + 'recall': Recall, + 'precision': Precision, + 'weighted_accuracy': WeightedAccuracy + } + +models = {'celoe': CELOE, + 'ocel': OCEL, + 'evolearner': EvoLearner, + 'nces': NCES, + 'drill': Drill} + +heuristics = {'celoe': CELOEHeuristic, + 'ocel': OCELHeuristic} + +def transform_string(input_string): + """Used to turn camelCase arguments to snake_case""" + # Use regex to find all capital letters C and replace them with '_C' + transformed_string = re.sub(r'([A-Z])', r'_\1', input_string).lower() + + # Remove the leading underscore if it exists + transformed_string = transformed_string.lstrip('_') + + return transformed_string + + +def compute_quality(KB, solution, pos, neg, qulaity_func="f1"): + func = metrics[qulaity_func]().score2 + instances = set(KB.individuals(solution)) + if isinstance(list(pos)[0], str): + instances = {ind.get_iri().as_str().split("/")[-1] for ind in instances} + tp = len(pos.intersection(instances)) + fn = len(pos.difference(instances)) + fp = len(neg.intersection(instances)) + tn = len(neg.difference(instances)) + return func(tp=tp, fn=fn, fp=fp, tn=tn)[-1] def _get_matching_opts(_Type, optargs, kwargs, *, prefix=None): """Find the keys in kwargs that are parameters of _Type. @@ -37,6 +85,8 @@ def p(s): for opt in sig: if p(opt) in kwargs: opts[opt] = kwargs.pop(p(opt)) + elif transform_string(p(opt)) in kwargs: + opts[opt] = kwargs.pop(transform_string(p(opt))) elif opt in optargs: opts[opt] = optargs[opt] return opts @@ -85,7 +135,6 @@ def ModelAdapter(*args, **kwargs): # noqa: C901 else: kb_type = kwargs.pop("knowledge_base_type", None) if kb_type is None: - from ontolearn.knowledge_base import KnowledgeBase kb_type = KnowledgeBase else: kb_type = kb_type @@ -100,7 +149,6 @@ def ModelAdapter(*args, **kwargs): # noqa: C901 assert isinstance(kb, AbstractKnowledgeBase) if "ignore" in kwargs: - from ontolearn.knowledge_base import KnowledgeBase assert isinstance(kb, KnowledgeBase) target_kb = kb.ignore_and_copy(ignored_classes=kwargs.pop("ignore")) else: @@ -128,7 +176,6 @@ def ModelAdapter(*args, **kwargs): # noqa: C901 else: op_type = kwargs.pop("refinement_operator_type", None) if op_type is None: - from ontolearn.refinement_operators import ModifiedCELOERefinement op_type = ModifiedCELOERefinement assert issubclass(op_type, BaseRefinement) operator = op_type(**_get_matching_opts( @@ -144,7 +191,6 @@ def ModelAdapter(*args, **kwargs): # noqa: C901 else: quality_type = kwargs.pop("quality_type", None) if quality_type is None: - from ontolearn.metrics import F1 quality_type = F1 assert issubclass(quality_type, AbstractScorer) qual = quality_type(**_get_matching_opts(quality_type, {}, kwargs)) @@ -157,7 +203,6 @@ def ModelAdapter(*args, **kwargs): # noqa: C901 else: heuristic_type = kwargs.pop("heuristic_type", None) if heuristic_type is None: - from ontolearn.heuristics import CELOEHeuristic heuristic_type = CELOEHeuristic assert issubclass(heuristic_type, AbstractHeuristic) heur = heuristic_type(**_get_matching_opts(heuristic_type, {}, kwargs)) @@ -171,7 +216,6 @@ def 
ModelAdapter(*args, **kwargs): # noqa: C901 else: learner_type = kwargs.pop("learner_type", None) if learner_type is None: - from ontolearn.concept_learner import CELOE learner_type = CELOE assert issubclass(learner_type, BaseConceptLearner) learner_args = _get_matching_opts(learner_type, {}, kwargs) @@ -289,3 +333,63 @@ def save_best_hypothesis(self, n: int = 10, path: str = 'Predictions', rdf_forma rdf_format: Serialisation format. currently supported: "rdfxml". """ self.learner.save_best_hypothesis(n, path, rdf_format) + + +def execute(args): + + args_d = args.__dict__ + learner_type = models[args.model] + optargs = {} + if args.sparql_endpoint: + kb = KnowledgeBase(triplestore_address=args.sparql_endpoint) + else: + kb = KnowledgeBase(path=args.knowledge_base_path) + + with open(args.path_learning_problem) as json_file: + examples = json.load(json_file) + pos = set(map(OWLNamedIndividual, map(IRI.create, set(examples['positive_examples'])))) + neg = set(map(OWLNamedIndividual, map(IRI.create, set(examples['negative_examples'])))) + lp = PosNegLPStandard(pos=pos, neg=neg) + + if args.model in ["celoe", "ocel"]: + heur_func = heuristics[args.model](**_get_matching_opts(heuristics[args.model], {}, args_d)) + refinement_op = ModifiedCELOERefinement(**_get_matching_opts(ModifiedCELOERefinement, + {"knowledge_base": kb, + "value_splitter": BinningValueSplitter(args.max_nr_splits)}, + args_d)) + optargs = {"knowledge_base": kb, + "quality_func": metrics[args.quality_metric](), + "heuristic_func": heur_func, + "refinement_operator": refinement_op} + elif args.model == "evolearner": + fit_func = LinearPressureFitness(**_get_matching_opts(LinearPressureFitness, {}, args_d)) + init_rw_method = EARandomWalkInitialization(**_get_matching_opts(EARandomWalkInitialization, {}, args_d)) + algorithm = EASimple(**_get_matching_opts(EASimple, {}, args_d)) + mut_uniform_gen = EARandomInitialization(**_get_matching_opts( + EARandomInitialization, {"method": getattr(RandomInitMethod, args.init_method_type)}, args_d)) + value_splitter = EntropyValueSplitter(**_get_matching_opts(EntropyValueSplitter, {}, args_d)) + + optargs = {"knowledge_base": kb, + "quality_func": metrics[args.quality_metric](), + "fitness_func": fit_func, + "init_method": init_rw_method, + "algorithm": algorithm, + "mut_uniform_gen": mut_uniform_gen, + "value_splitter": value_splitter} + elif args.model == "drill": + optargs = {"knowledge_base": kb, + "quality_func": metrics[args.quality_metric]()} + + model = learner_type(**_get_matching_opts(learner_type, optargs, args_d)) + + if args.model in ["celoe", "evolearner", "ocel"]: + trainer = Trainer(model, kb.reasoner()) + trainer.fit(lp) + print(list(trainer.best_hypotheses(1)).pop()) + + elif args.model in ["nces"]: + hypothesis = model.fit(pos, neg) + report = "Prediction: " + DLSyntaxObjectRenderer().render(hypothesis) + "Quality: " + \ + compute_quality(kb, hypothesis, pos, neg, args.quality_metric) + "Individuals: " + \ + kb.individuals_count(hypothesis) + print(report) diff --git a/setup.py b/setup.py index 81cf7685..6cc27ca9 100644 --- a/setup.py +++ b/setup.py @@ -32,6 +32,7 @@ "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", "Topic :: Scientific/Engineering :: Artificial Intelligence"], python_requires='>=3.8', + entry_points={"console_scripts": ["ontolearn = ontolearn.run:main"]}, long_description=long_description, long_description_content_type="text/markdown", ) From ee2f8c60e7bfd280b99756c76241585261e204d5 Mon Sep 17 00:00:00 2001 From: Alkid 
Date: Mon, 4 Dec 2023 14:20:57 +0100 Subject: [PATCH 05/31] Minor refactoring --- main.py | 2 +- ontolearn/model_adapter.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 769f6007..faaa3242 100644 --- a/main.py +++ b/main.py @@ -5,7 +5,7 @@ def get_default_arguments(description=None): parser = ArgumentParser() - parser.add_argument("--model", type=str, default="nces", choices=["celoe", "ocel", "evolearner", "nces"], + parser.add_argument("--model", type=str, default="celoe", choices=["celoe", "ocel", "evolearner", "nces"], help="Available concept learning models.") # Knowledge graph related arguments diff --git a/ontolearn/model_adapter.py b/ontolearn/model_adapter.py index 1b526867..c9100336 100644 --- a/ontolearn/model_adapter.py +++ b/ontolearn/model_adapter.py @@ -376,9 +376,9 @@ def execute(args): "algorithm": algorithm, "mut_uniform_gen": mut_uniform_gen, "value_splitter": value_splitter} - elif args.model == "drill": - optargs = {"knowledge_base": kb, - "quality_func": metrics[args.quality_metric]()} + # elif args.model == "drill": + # optargs = {"knowledge_base": kb, + # "quality_func": metrics[args.quality_metric]()} model = learner_type(**_get_matching_opts(learner_type, optargs, args_d)) From 8fe777b367d8efaae3c1cca08fe21532bcdb6448 Mon Sep 17 00:00:00 2001 From: Alkid Date: Mon, 4 Dec 2023 14:33:44 +0100 Subject: [PATCH 06/31] Fixed bug on always false condition --- ontolearn/abstracts.py | 2 +- ontolearn/utils/__init__.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/ontolearn/abstracts.py b/ontolearn/abstracts.py index 91997a1c..445b5ef6 100644 --- a/ontolearn/abstracts.py +++ b/ontolearn/abstracts.py @@ -467,7 +467,7 @@ def __init__(self, path_of_embeddings, reward_func, learning_rate=None, num_epochs_per_replay=None, num_workers=None, verbose=0): self.name = 'DRILL' self.instance_embeddings = read_csv(path_of_embeddings) - if self.instance_embeddings is None: + if not self.instance_embeddings: print("No embeddings found") self.embedding_dim = None else: diff --git a/ontolearn/utils/__init__.py b/ontolearn/utils/__init__.py index 67ca4cfd..2c0e6372 100644 --- a/ontolearn/utils/__init__.py +++ b/ontolearn/utils/__init__.py @@ -119,18 +119,20 @@ def read_csv(path)->Union[None,pd.DataFrame]: else: return None -def assertion_path_isfile(path) -> None: + +def assertion_path_isfile(path) -> bool: try: assert path is not None except AssertionError: print(f'Path can not be:{path}') - return None + return False try: assert os.path.isfile(path) except (AssertionError, TypeError): print(f'Input:{path} not found.') - return None + return False + return True def sanity_checking_args(args): From 057b73a909a2aa8913c7c33057d1ade3cb2de680 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Tue, 5 Dec 2023 11:01:11 +0100 Subject: [PATCH 07/31] Errors are pointed out --- ontolearn/model_adapter.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ontolearn/model_adapter.py b/ontolearn/model_adapter.py index c9100336..b0ea2126 100644 --- a/ontolearn/model_adapter.py +++ b/ontolearn/model_adapter.py @@ -385,10 +385,14 @@ def execute(args): if args.model in ["celoe", "evolearner", "ocel"]: trainer = Trainer(model, kb.reasoner()) trainer.fit(lp) - print(list(trainer.best_hypotheses(1)).pop()) + # @TODO:CD: If n=1, best_hypotheses must return an object containing a DL concept (e.g. 
OENODE), otherwise + # @TODO:CD a list of objects each of which contains a DL concept + # best_dl_concept=trainer.best_hypotheses(1) + # best_dl_concepts=trainer.best_hypotheses(10) elif args.model in ["nces"]: hypothesis = model.fit(pos, neg) + # @TODO:CD: model.fit() should return a train model itself, not predictions report = "Prediction: " + DLSyntaxObjectRenderer().render(hypothesis) + "Quality: " + \ compute_quality(kb, hypothesis, pos, neg, args.quality_metric) + "Individuals: " + \ kb.individuals_count(hypothesis) From e760da30c3e05f24b148dd4c0e8b3625577742bc Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Tue, 5 Dec 2023 17:10:39 +0100 Subject: [PATCH 08/31] WIP:ontolearn/learners --- examples/concept_learning_evaluation.py | 3 +- ontolearn/__init__.py | 2 + ontolearn/concept_learner.py | 823 ----------------------- ontolearn/learners/__init__.py | 1 + ontolearn/learners/drill.py | 838 ++++++++++++++++++++++++ ontolearn/learners/nero.py | 9 + 6 files changed, 852 insertions(+), 824 deletions(-) create mode 100644 ontolearn/learners/__init__.py create mode 100644 ontolearn/learners/drill.py create mode 100644 ontolearn/learners/nero.py diff --git a/examples/concept_learning_evaluation.py b/examples/concept_learning_evaluation.py index e715fde1..0b465ff3 100644 --- a/examples/concept_learning_evaluation.py +++ b/examples/concept_learning_evaluation.py @@ -3,7 +3,8 @@ import time import pandas as pd from ontolearn.knowledge_base import KnowledgeBase -from ontolearn.concept_learner import CELOE, OCEL, EvoLearner, Drill +from ontolearn.concept_learner import CELOE, OCEL, EvoLearner +from ontolearn import Drill from ontolearn.learning_problem import PosNegLPStandard from ontolearn.metrics import Accuracy, F1 from owlapy.model import OWLClass, OWLNamedIndividual, IRI diff --git a/ontolearn/__init__.py b/ontolearn/__init__.py index b000c335..bfb401a7 100644 --- a/ontolearn/__init__.py +++ b/ontolearn/__init__.py @@ -16,3 +16,5 @@ # from .metrics import * # from .search import * __all__ = ['knowledge_base', 'abstracts', 'base_concept_learner', 'metrics', 'search'] + +from .learners import Drill \ No newline at end of file diff --git a/ontolearn/concept_learner.py b/ontolearn/concept_learner.py index c10d7943..3a5b0c91 100644 --- a/ontolearn/concept_learner.py +++ b/ontolearn/concept_learner.py @@ -48,7 +48,6 @@ from owlapy.util import OrderedOWLObject from sortedcontainers import SortedSet import os -from .data_struct import Experience logger = logging.getLogger(__name__) _concept_operand_sorter = ConceptOperandSorter() @@ -650,828 +649,6 @@ def make_node(self, c: OWLClassExpression, parent_node: Optional[OENode] = None, return r -class Drill(RefinementBasedConceptLearner): - """Deep Reinforcement Learning for Refinement Operators in ALC.""" - - def __init__(self, knowledge_base, - path_of_embeddings: str = None, refinement_operator: LengthBasedRefinement = None, - use_inverse=True, - use_data_properties=True, - use_card_restrictions=True, - card_limit=10, - quality_func: AbstractScorer = None, - reward_func=None, - batch_size=None, num_workers=None, pretrained_model_name=None, - iter_bound=None, max_num_of_concepts_tested=None, verbose=None, terminate_on_goal=None, - max_len_replay_memory=None, epsilon_decay=None, epsilon_min=None, num_epochs_per_replay=None, - num_episodes_per_replay=None, learning_rate=None, max_runtime=None, num_of_sequential_actions=None, - num_episode=None): - - print("***DRILL has not yet been fully integrated***") - self.name = "DRILL" - # TODO: Clear difference 
between training and testing should be defined at init - if refinement_operator is None: - refinement_operator = LengthBasedRefinement(knowledge_base=knowledge_base, - use_data_properties=use_data_properties, - use_card_restrictions=use_card_restrictions, - card_limit=card_limit, - use_inverse=use_inverse) - self.reward_func = reward_func - self.representation_mode = "averaging" - self.heuristic_func = None - self.num_workers = num_workers - self.epsilon = 1 - self.learning_rate = .001 - self.num_episode = 1 - self.num_of_sequential_actions = 3 - self.num_epochs_per_replay = 1 - self.max_len_replay_memory = 256 - self.epsilon_decay = 0.01 - self.epsilon_min = 0 - self.batch_size = 1024 - self.verbose = 0 - self.num_episodes_per_replay = 2 - self.seen_examples = dict() - self.emb_pos, self.emb_neg = None, None - self.start_time = None - self.goal_found = False - self.experiences = Experience(maxlen=self.max_len_replay_memory) - - if path_of_embeddings is not None and os.path.exists(path_of_embeddings): - self.instance_embeddings = pd.read_csv(path_of_embeddings) - self.embedding_dim = self.instance_embeddings.shape[1] - else: - self.instance_embeddings = None - self.embedding_dim = 12 - - self.sample_size = 1 - arg_net = {'input_shape': (4 * self.sample_size, self.embedding_dim), - 'first_out_channels': 32, 'second_out_channels': 16, 'third_out_channels': 8, - 'kernel_size': 3} - self.heuristic_func = DrillHeuristic(mode='averaging', model_args=arg_net) - if self.learning_rate: - self.optimizer = torch.optim.Adam(self.heuristic_func.net.parameters(), lr=self.learning_rate) - - if pretrained_model_name: - self.pre_trained_model_loaded = True - self.heuristic_func.net.load_state_dict(torch.load(pretrained_model_name, torch.device('cpu'))) - else: - self.pre_trained_model_loaded = False - - RefinementBasedConceptLearner.__init__(self, knowledge_base=knowledge_base, - refinement_operator=refinement_operator, - quality_func=quality_func, - heuristic_func=self.heuristic_func, - terminate_on_goal=terminate_on_goal, - iter_bound=iter_bound, - max_num_of_concepts_tested=max_num_of_concepts_tested, - max_runtime=max_runtime) - print('Number of parameters: ', sum([p.numel() for p in self.heuristic_func.net.parameters()])) - - self.search_tree = DRILLSearchTreePriorityQueue() - self._learning_problem = None - self.storage_path, _ = create_experiment_folder() - - def best_hypotheses(self, n=1): - assert self.search_tree is not None - assert len(self.search_tree) > 1 - if n == 1: - return [i for i in self.search_tree.get_top_n_nodes(n)][0] - else: - return [i for i in self.search_tree.get_top_n_nodes(n)] - - def clean(self): - self.emb_pos, self.emb_neg = None, None - self.goal_found = False - self.start_time = None - if len(self.search_tree) != 0: - self.search_tree.clean() - - try: - assert len(self.search_tree) == 0 - except AssertionError: - print(len(self.search_tree)) - raise AssertionError('EMPTY search tree') - - self._number_of_tested_concepts = 0 - - def downward_refinement(self, *args, **kwargs): - ValueError('downward_refinement') - - def next_node_to_expand(self) -> RL_State: - """ Return a node that maximizes the heuristic function at time t. """ - return self.search_tree.get_most_promising() - - def initialize_class_expression_learning_problem(self, pos: Set[OWLNamedIndividual], neg: Set[OWLNamedIndividual]): - """ - Determine the learning problem and initialize the search. - 1) Convert the string representation of an individuals into the owlready2 representation. 
- 2) Sample negative examples if necessary. - 3) Initialize the root and search tree. - """ - self.clean() - assert 0 < len(pos) and 0 < len(neg) - - # 1. - # Generate a Learning Problem - self._learning_problem = PosNegLPStandard(pos=set(pos), neg=set(neg)).encode_kb(self.kb) - # 2. Obtain embeddings of positive and negative examples. - if self.instance_embeddings is None: - self.emb_pos = None - self.emb_neg = None - else: - self.emb_pos = torch.tensor( - self.instance_embeddings.loc[[owl_indv.get_iri().as_str() for owl_indv in pos]].values, - dtype=torch.float32) - self.emb_neg = torch.tensor( - self.instance_embeddings.loc[[owl_indv.get_iri().as_str() for owl_indv in neg]].values, - dtype=torch.float32) - - # (3) Take the mean of positive and negative examples and reshape it into (1,1,embedding_dim) for mini batching. - self.emb_pos = torch.mean(self.emb_pos, dim=0) - self.emb_pos = self.emb_pos.view(1, 1, self.emb_pos.shape[0]) - self.emb_neg = torch.mean(self.emb_neg, dim=0) - self.emb_neg = self.emb_neg.view(1, 1, self.emb_neg.shape[0]) - # Sanity checking - if torch.isnan(self.emb_pos).any() or torch.isinf(self.emb_pos).any(): - raise ValueError('invalid value detected in E+,\n{0}'.format(self.emb_pos)) - if torch.isnan(self.emb_neg).any() or torch.isinf(self.emb_neg).any(): - raise ValueError('invalid value detected in E-,\n{0}'.format(self.emb_neg)) - - # Initialize ROOT STATE - root_rl_state = self.create_rl_state(self.start_class, is_root=True) - self.compute_quality_of_class_expression(root_rl_state) - return root_rl_state - - def fit(self, lp: PosNegLPStandard, max_runtime=None): - if max_runtime: - assert isinstance(max_runtime, int) - self.max_runtime = max_runtime - # @TODO: Type injection should be possible for all - pos_type_counts = Counter( - [i for i in chain.from_iterable((self.kb.get_types(ind, direct=True) for ind in lp.pos))]) - neg_type_counts = Counter( - [i for i in chain.from_iterable((self.kb.get_types(ind, direct=True) for ind in lp.neg))]) - type_bias = pos_type_counts - neg_type_counts - - # (1) Initialize learning problem - root_state = self.initialize_class_expression_learning_problem(pos=lp.pos, neg=lp.neg) - # (2) Add root state into search tree - root_state.heuristic = root_state.quality - self.search_tree.add(root_state) - # (3) Inject Type Bias - for x in (self.create_rl_state(i, parent_node=root_state) for i in type_bias): - self.compute_quality_of_class_expression(x) - x.heuristic = x.quality - self.search_tree.add(x) - self.start_time = time.time() - # (3) Search - for i in range(1, self.iter_bound): - # (1) Get the most fitting RL-state - most_promising = self.next_node_to_expand() - next_possible_states = [] - # (2) Refine (1) - for ref in self.apply_refinement(most_promising): - # (2.1) If the next possible RL-state is not a dead end - # (2.1.) If the refinement of (1) is not equivalent to \bottom - if len(ref.instances): - # Compute quality - self.compute_quality_of_class_expression(ref) - if ref.quality == 0: - continue - next_possible_states.append(ref) - if ref.quality == 1.0: - break - try: - assert len(next_possible_states) > 0 - except AssertionError: - if self.verbose > 1: - logger.info(f'DEAD END at {most_promising}') - continue - if len(next_possible_states) == 0: - # We do not need to compute Q value based on embeddings of "zeros". 
- continue - - if self.pre_trained_model_loaded is True: - preds = self.predict_values(current_state=most_promising, next_states=next_possible_states) - else: - preds = None - self.goal_found = self.update_search(next_possible_states, preds) - if self.goal_found: - if self.terminate_on_goal: - return self.terminate() - if time.time() - self.start_time > self.max_runtime: - return self.terminate() - - def show_search_tree(self, heading_step: str, top_n: int = 10) -> None: - ValueError('show_search_tree') - - def terminate_training(self): - ValueError('terminate_training') - - def fit_from_iterable(self, - dataset: List[Tuple[object, Set[OWLNamedIndividual], Set[OWLNamedIndividual]]], - max_runtime: int = None) -> List: - """ - Dataset is a list of tuples where the first item is either str or OWL class expression indicating target - concept. - """ - if max_runtime: - self.max_runtime = max_runtime - renderer = DLSyntaxObjectRenderer() - - results = [] - for (target_ce, p, n) in dataset: - if self.verbose > 0: - logger.info(f'TARGET OWL CLASS EXPRESSION:\n{target_ce}') - logger.info(f'|Sampled Positive|:{len(p)}\t|Sampled Negative|:{len(n)}') - start_time = time.time() - self.fit(pos=p, neg=n, max_runtime=max_runtime) - rn = time.time() - start_time - h: RL_State = next(iter(self.best_hypotheses())) - # TODO:CD: We need to remove this first returned boolean for the sake of readability. - _, f_measure = F1().score_elp(instances=h.instances_bitset, learning_problem=self._learning_problem) - _, accuracy = Accuracy().score_elp(instances=h.instances_bitset, learning_problem=self._learning_problem) - - report = {'Target': str(target_ce), - 'Prediction': renderer.render(h.concept), - 'F-measure': f_measure, - 'Accuracy': accuracy, - 'NumClassTested': self._number_of_tested_concepts, - 'Runtime': rn} - results.append(report) - - return results - - def init_training(self, pos_uri: Set[OWLNamedIndividual], neg_uri: Set[OWLNamedIndividual]) -> None: - """ - Initialize training. - """ - """ (1) Generate a Learning Problem """ - self._learning_problem = PosNegLPStandard(pos=pos_uri, neg=neg_uri).encode_kb(self.kb) - """ (2) Update REWARD FUNC FOR each learning problem """ - self.reward_func.lp = self._learning_problem - """ (3) Obtain embeddings of positive and negative examples """ - self.emb_pos = torch.tensor( - self.instance_embeddings.loc[[owl_indv.get_iri().as_str() for owl_indv in pos_uri]].values, - dtype=torch.float32) - self.emb_neg = torch.tensor( - self.instance_embeddings.loc[[owl_indv.get_iri().as_str() for owl_indv in neg_uri]].values, - dtype=torch.float32) - """ (3) Take the mean of positive and negative examples and reshape it into (1,1,embedding_dim) for mini - batching """ - self.emb_pos = torch.mean(self.emb_pos, dim=0) - self.emb_pos = self.emb_pos.view(1, 1, self.emb_pos.shape[0]) - self.emb_neg = torch.mean(self.emb_neg, dim=0) - self.emb_neg = self.emb_neg.view(1, 1, self.emb_neg.shape[0]) - # Sanity checking - if torch.isnan(self.emb_pos).any() or torch.isinf(self.emb_pos).any(): - raise ValueError('invalid value detected in E+,\n{0}'.format(self.emb_pos)) - if torch.isnan(self.emb_neg).any() or torch.isinf(self.emb_neg).any(): - raise ValueError('invalid value detected in E-,\n{0}'.format(self.emb_neg)) - - # Default exploration exploitation tradeoff. 
- """ (3) Default exploration exploitation tradeoff and number of expression tested """ - self.epsilon = 1 - self._number_of_tested_concepts = 0 - - def create_rl_state(self, c: OWLClassExpression, parent_node: Optional[RL_State] = None, - is_root: bool = False) -> RL_State: - """ Create an RL_State instance.""" - # Create State - rl_state = RL_State(c, parent_node=parent_node, is_root=is_root) - # Assign Embeddings to it. Later, assign_embeddings can be also done in RL_STATE - self.assign_embeddings(rl_state) - rl_state.length = self.kb.concept_len(c) - return rl_state - - def compute_quality_of_class_expression(self, state: RL_State) -> None: - """ Compute Quality of owl class expression.""" - self.quality_func.apply(state, state.instances_bitset, self._learning_problem) - self._number_of_tested_concepts += 1 - - def apply_refinement(self, rl_state: RL_State) -> Generator: - """ - Refine an OWL Class expression \\|= Observing next possible states. - - 1. Generate concepts by refining a node. - 1.1. Compute allowed length of refinements. - 1.2. Convert concepts if concepts do not belong to self.concepts_to_ignore. - Note that i.str not in self.concepts_to_ignore => O(1) if a set is being used. - 3. Return Generator. - """ - assert isinstance(rl_state, RL_State) - # 1. - for i in self.operator.refine(rl_state.concept): # O(N) - yield self.create_rl_state(i, parent_node=rl_state) - - def learn_from_illustration(self, sequence_of_goal_path: List[RL_State]): - """ - Args: - sequence_of_goal_path: ⊤,Parent,Parent ⊓ Daughter. - """ - current_state = sequence_of_goal_path.pop(0) - rewards = [] - sequence_of_states = [] - while len(sequence_of_goal_path) > 0: - self.assign_embeddings(current_state) - current_state.length = self.kb.concept_len(current_state.concept) - if current_state.quality is None: - self.compute_quality_of_class_expression(current_state) - - next_state = sequence_of_goal_path.pop(0) - self.assign_embeddings(next_state) - next_state.length = self.kb.concept_len(next_state.concept) - if next_state.quality is None: - self.compute_quality_of_class_expression(next_state) - sequence_of_states.append((current_state, next_state)) - rewards.append(self.reward_func.apply(current_state, next_state)) - for x in range(2): - self.form_experiences(sequence_of_states, rewards) - self.learn_from_replay_memory() - - def rl_learning_loop(self, pos_uri: Set[OWLNamedIndividual], neg_uri: Set[OWLNamedIndividual], - goal_path: List[RL_State] = None) -> List[float]: - """ - Standard RL training loop. - - 1. Initialize RL environment for training. - - 2. Learn from an illustration if possible. - 2. Training Loop. - """ - """ (1) Initialize RL environment for training """ - self.init_training(pos_uri=pos_uri, neg_uri=neg_uri) - root_rl_state = self.create_rl_state(self.start_class, is_root=True) - self.compute_quality_of_class_expression(root_rl_state) - sum_of_rewards_per_actions = [] - log_every_n_episodes = int(self.num_episode * .1) + 1 - """ (2) Learn from an illustration if possible """ - if goal_path: - self.learn_from_illustration(goal_path) - - """ (3) Reinforcement Learning offline training loop """ - for th in range(self.num_episode): - """ (3.1) Sequence of decisions """ - sequence_of_states, rewards = self.sequence_of_actions(root_rl_state) - - if self.verbose >= 10: - logger.info('#' * 10, end='') - logger.info(f'{th}\t.th Sequence of Actions', end='') - logger.info('#' * 10) - for step, (current_state, next_state) in enumerate(sequence_of_states): - logger.info(f'{step}. 
Transition \n{current_state}\n----->\n{next_state}') - logger.info(f'Reward:{rewards[step]}') - - if th % log_every_n_episodes == 0: - if self.verbose >= 1: - logger.info('{0}.th iter. SumOfRewards: {1:.2f}\t' - 'Epsilon:{2:.2f}\t' - '|ReplayMem.|:{3}'.format(th, sum(rewards), - self.epsilon, - len(self.experiences))) - """(3.2) Form experiences""" - self.form_experiences(sequence_of_states, rewards) - sum_of_rewards_per_actions.append(sum(rewards)) - """(3.2) Learn from experiences""" - if th % self.num_episodes_per_replay == 0: - self.learn_from_replay_memory() - """(3.4) Exploration Exploitation""" - if self.epsilon < 0: - break - self.epsilon -= self.epsilon_decay - - return sum_of_rewards_per_actions - - def sequence_of_actions(self, root_rl_state: RL_State) -> Tuple[List[Tuple[AbstractNode, AbstractNode]], - List[SupportsFloat]]: - assert isinstance(root_rl_state, RL_State) - - current_state = root_rl_state - path_of_concepts = [] - rewards = [] - - assert len(current_state.embeddings) > 0 # Embeddings are initialized - assert current_state.quality > 0 - assert current_state.heuristic is None - - # (1) - for _ in range(self.num_of_sequential_actions): - assert isinstance(current_state, RL_State) - # (1.1) Observe Next RL states, i.e., refine an OWL class expression - next_rl_states = list(self.apply_refinement(current_state)) - # (1.2) - if len(next_rl_states) == 0: # DEAD END - # assert (current_state.length + 3) <= self.max_child_length - print('No next state') - break - # (1.3) - next_selected_rl_state = self.exploration_exploitation_tradeoff(current_state, next_rl_states) - # (1.4) Remember the concept path - path_of_concepts.append((current_state, next_selected_rl_state)) - # (1.5) - rewards.append(self.reward_func.apply(current_state, next_selected_rl_state)) - # (1.6) - current_state = next_selected_rl_state - return path_of_concepts, rewards - - def form_experiences(self, state_pairs: List, rewards: List) -> None: - """ - Form experiences from a sequence of concepts and corresponding rewards. - - state_pairs - A list of tuples containing two consecutive states. - reward - A list of reward. - - Gamma is 1. - - Return - X - A list of embeddings of current concept, next concept, positive examples, negative examples. - y - Argmax Q value. - """ - - if self.verbose > 1: - print('Form Experiences for the training') - - for th, consecutive_states in enumerate(state_pairs): - e, e_next = consecutive_states - self.experiences.append( - (e, e_next, max(rewards[th:]))) # given e, e_next, Q val is the max Q value reachable. - - def learn_from_replay_memory(self) -> None: - """ - Learning by replaying memory. 
- """ - if self.verbose > 1: - print('Learn from Experience') - - current_state_batch, next_state_batch, q_values = self.experiences.retrieve() - current_state_batch = torch.cat(current_state_batch, dim=0) - next_state_batch = torch.cat(next_state_batch, dim=0) - q_values = torch.Tensor(q_values) - - try: - assert current_state_batch.shape[1] == next_state_batch.shape[1] == self.emb_pos.shape[1] == \ - self.emb_neg.shape[1] - - except AssertionError as e: - print(current_state_batch.shape) - print(next_state_batch.shape) - print(self.emb_pos.shape) - print(self.emb_neg.shape) - print('Wrong format.') - print(e) - raise - - assert current_state_batch.shape[2] == next_state_batch.shape[2] == self.emb_pos.shape[2] == self.emb_neg.shape[ - 2] - dataset = PrepareBatchOfTraining(current_state_batch=current_state_batch, - next_state_batch=next_state_batch, - p=self.emb_pos, n=self.emb_neg, q=q_values) - num_experience = len(dataset) - data_loader = torch.utils.data.DataLoader(dataset, - batch_size=self.batch_size, shuffle=True, - num_workers=self.num_workers) - if self.verbose > 1: - print(f'Number of experiences:{num_experience}') - print('DQL agent is learning via experience replay') - self.heuristic_func.net.train() - for m in range(self.num_epochs_per_replay): - total_loss = 0 - for X, y in data_loader: - self.optimizer.zero_grad() # zero the gradient buffers - # forward - predicted_q = self.heuristic_func.net.forward(X) - # loss - loss = self.heuristic_func.net.loss(predicted_q, y) - total_loss += loss.item() - # compute the derivative of the loss w.r.t. the parameters using backpropagation - loss.backward() - # clip gradients if gradients are killed. =>torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5) - self.optimizer.step() - if self.verbose > 1: - print(f'{m}.th Epoch average loss during training:{total_loss / num_experience}') - - self.heuristic_func.net.train().eval() - - def update_search(self, concepts, predicted_Q_values=None): - """ - @param concepts: - @param predicted_Q_values: - @return: - """ - if predicted_Q_values is not None: - for child_node, pred_Q in zip(concepts, predicted_Q_values): - child_node.heuristic = pred_Q - if child_node.quality > 0: # > too weak, ignore. - self.search_tree.add(child_node) - if child_node.quality == 1: - return child_node - else: - for child_node in concepts: - child_node.heuristic = child_node.quality - if child_node.quality > 0: # > too weak, ignore. - self.search_tree.add(child_node) - if child_node.quality == 1: - return child_node - - def assign_embeddings(self, rl_state: RL_State) -> None: - """ - Assign embeddings to a rl state. A rl state is represented with vector representation of - all individuals belonging to a respective OWLClassExpression. - """ - assert isinstance(rl_state, RL_State) - # (1) Detect mode of representing OWLClassExpression - if self.representation_mode == 'averaging': - # (2) if input node has not seen before, assign embeddings. - if rl_state.embeddings is None: - assert isinstance(rl_state.concept, OWLClassExpression) - # (3) Retrieval instances via our retrieval function (R(C)). Be aware Open World and Closed World - # Assumption - rl_state.instances = set(self.kb.individuals(rl_state.concept)) - # (4) Retrieval instances in terms of bitset. - rl_state.instances_bitset = self.kb.individuals_set(rl_state.concept) - # (5) |R(C)|=\emptyset ? 
- if len(rl_state.instances) == 0: - # If|R(C)|=\emptyset, then represent C with zeros - if self.instance_embeddings is not None: - emb = torch.zeros(1, self.sample_size, self.instance_embeddings.shape[1]) - else: - emb = torch.rand(size=(1, self.sample_size, self.embedding_dim)) - else: - # If|R(C)| \not= \emptyset, then take the mean of individuals. - str_idx = [i.get_iri().as_str() for i in rl_state.instances] - assert len(str_idx) > 0 - if self.instance_embeddings is not None: - emb = torch.tensor(self.instance_embeddings.loc[str_idx].values, dtype=torch.float32) - emb = torch.mean(emb, dim=0) - emb = emb.view(1, self.sample_size, self.instance_embeddings.shape[1]) - else: - emb = torch.rand(size=(1, self.sample_size, self.embedding_dim)) - # (6) Assign embeddings - rl_state.embeddings = emb - else: - """ Embeddings already assigned.""" - try: - assert rl_state.embeddings.shape == (1, self.sample_size, self.embedding_dim) - except AssertionError as e: - print(e) - print(rl_state) - print(rl_state.embeddings.shape) - print((1, self.sample_size, self.instance_embeddings.shape[1])) - raise - elif self.representation_mode == 'sampling': - raise NotImplementedError('Sampling technique for state representation is not implemented.') - """ - if node.embeddings is None: - str_idx = [get_full_iri(i).replace('\n', '') for i in node.concept.instances] - if len(str_idx) >= self.sample_size: - sampled_str_idx = random.sample(str_idx, self.sample_size) - emb = torch.tensor(self.instance_embeddings.loc[sampled_str_idx].values, dtype=torch.float32) - else: - num_rows_to_fill = self.sample_size - len(str_idx) - emb = torch.tensor(self.instance_embeddings.loc[str_idx].values, dtype=torch.float32) - emb = torch.cat((torch.zeros(num_rows_to_fill, self.instance_embeddings.shape[1]), emb)) - emb = emb.view(1, self.sample_size, self.instance_embeddings.shape[1]) - node.embeddings = emb - else: - try: - assert node.embeddings.shape == (1, self.sample_size, self.instance_embeddings.shape[1]) - except AssertionError: - print(node) - print(self.sample_size) - print(node.embeddings.shape) - print((1, self.sample_size, self.instance_embeddings.shape[1])) - raise ValueError - """ - else: - raise ValueError - - # @todo remove this testing in experiments. - if torch.isnan(rl_state.embeddings).any() or torch.isinf(rl_state.embeddings).any(): - # No individual contained in the input concept. - # Sanity checking. - raise ValueError - - def save_weights(self): - """ - Save pytorch weights. - """ - # Save model. - torch.save(self.heuristic_func.net.state_dict(), - self.storage_path + '/{0}.pth'.format(self.heuristic_func.name)) - - def exploration_exploitation_tradeoff(self, current_state: AbstractNode, - next_states: List[AbstractNode]) -> AbstractNode: - """ - Exploration vs Exploitation tradeoff at finding next state. - (1) Exploration. - (2) Exploitation. - """ - if np.random.random() < self.epsilon: - next_state = random.choice(next_states) - self.assign_embeddings(next_state) - else: - next_state = self.exploitation(current_state, next_states) - self.compute_quality_of_class_expression(next_state) - return next_state - - def exploitation(self, current_state: AbstractNode, next_states: List[AbstractNode]) -> AbstractNode: - """ - Find next node that is assigned with highest predicted Q value. - - (1) Predict Q values : predictions.shape => torch.Size([n, 1]) where n = len(next_states). - - (2) Find the index of max value in predictions. - - (3) Use the index to obtain next state. - - (4) Return next state. 
- """ - predictions: torch.Tensor = self.predict_values(current_state, next_states) - argmax_id = int(torch.argmax(predictions)) - next_state = next_states[argmax_id] - """ - # Sanity checking - print('#'*10) - for s, q in zip(next_states, predictions): - print(s, q) - print('#'*10) - print(next_state,f'\t {torch.max(predictions)}') - """ - return next_state - - def predict_values(self, current_state: AbstractNode, next_states: List[AbstractNode]) -> torch.Tensor: - """ - Predict promise of next states given current state. - - Returns: - Predicted Q values. - """ - # Instead it should be get embeddings ? - self.assign_embeddings(current_state) - assert len(next_states) > 0 - with torch.no_grad(): - self.heuristic_func.net.eval() - # create batch batch. - next_state_batch = [] - for _ in next_states: - self.assign_embeddings(_) - next_state_batch.append(_.embeddings) - next_state_batch = torch.cat(next_state_batch, dim=0) - x = PrepareBatchOfPrediction(current_state.embeddings, - next_state_batch, - self.emb_pos, - self.emb_neg).get_all() - predictions = self.heuristic_func.net.forward(x) - return predictions - - @staticmethod - def retrieve_concept_chain(rl_state: RL_State) -> List[RL_State]: - hierarchy = deque() - if rl_state.parent_node: - hierarchy.appendleft(rl_state.parent_node) - while hierarchy[-1].parent_node is not None: - hierarchy.append(hierarchy[-1].parent_node) - hierarchy.appendleft(rl_state) - return list(hierarchy) - - def train(self, dataset: Iterable[Tuple[str, Set, Set]], relearn_ratio: int = 2): - """ - Train RL agent on learning problems with relearn_ratio. - - Args: - dataset: An iterable containing training data. Each item corresponds to a tuple of string representation - of target concept, a set of positive examples in the form of URIs amd a set of negative examples in - the form of URIs, respectively. - relearn_ratio: An integer indicating the number of times dataset is iterated. - - Computation: - 1. Dataset and relearn_ratio loops: Learn each problem relearn_ratio times. - - 2. Learning loop. - - 3. Take post process action that implemented by subclass. - - Returns: - self. - """ - if self.verbose > 0: - logger.info(f'Training starts.\nNumber of learning problem:{len(dataset)},\t Relearn ratio:{relearn_ratio}') - counter = 1 - renderer = DLSyntaxObjectRenderer() - - # 1. - for _ in range(relearn_ratio): - for (target_owl_ce, positives, negatives) in dataset: - - if self.verbose > 0: - logger.info( - 'Goal Concept:{0}\tE^+:[{1}] \t E^-:[{2}]'.format(target_owl_ce, - len(positives), len(negatives))) - logger.info(f'RL training on {counter}.th learning problem starts') - - goal_path = list(reversed(self.retrieve_concept_chain(target_owl_ce))) - # goal_path: [⊤, Daughter, Daughter ⊓ Mother] - sum_of_rewards_per_actions = self.rl_learning_loop(pos_uri=positives, neg_uri=negatives, - goal_path=goal_path) - - if self.verbose > 2: - logger.info(f'Sum of Rewards in first 3 trajectory:{sum_of_rewards_per_actions[:3]}') - logger.info(f'Sum of Rewards in last 3 trajectory:{sum_of_rewards_per_actions[:3]}') - - self.seen_examples.setdefault(counter, dict()).update( - {'Concept': renderer.render(target_owl_ce.concept), - 'Positives': [i.get_iri().as_str() for i in positives], - 'Negatives': [i.get_iri().as_str() for i in negatives]}) - - counter += 1 - if counter % 100 == 0: - self.save_weights() - # 3. - return self.terminate_training() - - -class DrillHeuristic: - """ - Heuristic in Convolutional DQL concept learning. - Heuristic implements a convolutional neural network. 
- """ - - def __init__(self, pos=None, neg=None, model=None, mode=None, model_args=None): - if model: - self.net = model - elif mode in ['averaging', 'sampling']: - self.net = DrillNet(model_args) - self.mode = mode - self.name = 'DrillHeuristic_' + self.mode - else: - raise ValueError - self.net.eval() - - def score(self, node, parent_node=None): - """ Compute heuristic value of root node only""" - if parent_node is None and node.is_root: - return torch.FloatTensor([.0001]).squeeze() - raise ValueError - - def apply(self, node, parent_node=None): - """ Assign predicted Q-value to node object.""" - predicted_q_val = self.score(node, parent_node) - node.heuristic = predicted_q_val - - -class DrillNet(nn.Module): - """ - A neural model for Deep Q-Learning. - - An input Drill has the following form: - 1. Indexes of individuals belonging to current state (s). - 2. Indexes of individuals belonging to next state (s_prime). - 3. Indexes of individuals provided as positive examples. - 4. Indexes of individuals provided as negative examples. - - Given such input, we from a sparse 3D Tensor where each slice is a **** N *** by ***D*** - where N is the number of individuals and D is the number of dimension of embeddings. - Given that N on the current benchmark datasets < 10^3, we can get away with this computation. By doing so - we do not need to subsample from given inputs. - - """ - - def __init__(self, args): - super(DrillNet, self).__init__() - self.in_channels, self.embedding_dim = args['input_shape'] - assert self.embedding_dim - - self.loss = nn.MSELoss() - # Conv1D seems to be faster than Conv2d - self.conv1 = nn.Conv1d(in_channels=4, - out_channels=args['first_out_channels'], - kernel_size=args['kernel_size'], - padding=1, stride=1, bias=True) - - # Fully connected layers. 
- self.size_of_fc1 = int(args['first_out_channels'] * self.embedding_dim) - self.fc1 = nn.Linear(in_features=self.size_of_fc1, out_features=self.size_of_fc1 // 2) - self.fc2 = nn.Linear(in_features=self.size_of_fc1 // 2, out_features=1) - - self.init() - - def init(self): - xavier_normal_(self.fc1.weight.data) - xavier_normal_(self.conv1.weight.data) - - def forward(self, X: torch.FloatTensor): - """ - X n by 4 by d float tensor - """ - # N x 32 x D - X = F.relu(self.conv1(X)) - X = X.flatten(start_dim=1) - # N x (32D/2) - X = F.relu(self.fc1(X)) - # N x 1 - scores = self.fc2(X).flatten() - return scores class EvoLearner(BaseConceptLearner[EvoLearnerNode]): diff --git a/ontolearn/learners/__init__.py b/ontolearn/learners/__init__.py new file mode 100644 index 00000000..4cb52422 --- /dev/null +++ b/ontolearn/learners/__init__.py @@ -0,0 +1 @@ +from .drill import Drill \ No newline at end of file diff --git a/ontolearn/learners/drill.py b/ontolearn/learners/drill.py new file mode 100644 index 00000000..2aab9533 --- /dev/null +++ b/ontolearn/learners/drill.py @@ -0,0 +1,838 @@ +from ontolearn.base_concept_learner import RefinementBasedConceptLearner +from ontolearn.refinement_operators import LengthBasedRefinement +from ontolearn.abstracts import AbstractScorer, AbstractNode +from ontolearn.search import RL_State +from typing import Set, List, Tuple, Optional, Generator, SupportsFloat, Iterable +from owlapy.model import OWLNamedIndividual, OWLClassExpression +from ontolearn.learning_problem import PosNegLPStandard, EncodedPosNegLPStandard +import torch +from ontolearn.data_struct import Experience +from ontolearn.search import DRILLSearchTreePriorityQueue +from ontolearn.utils import create_experiment_folder +from collections import Counter +from itertools import chain +import time + +class Drill(RefinementBasedConceptLearner): + """Deep Reinforcement Learning for Refinement Operators in ALC.""" + + def __init__(self, knowledge_base, + path_of_embeddings: str = None, refinement_operator: LengthBasedRefinement = None, + use_inverse=True, + use_data_properties=True, + use_card_restrictions=True, + card_limit=10, + quality_func: AbstractScorer = None, + reward_func=None, + batch_size=None, num_workers=None, pretrained_model_name=None, + iter_bound=None, max_num_of_concepts_tested=None, verbose=None, terminate_on_goal=None, + max_len_replay_memory=None, epsilon_decay=None, epsilon_min=None, num_epochs_per_replay=None, + num_episodes_per_replay=None, learning_rate=None, max_runtime=None, num_of_sequential_actions=None, + num_episode=None): + + print("***DRILL has not yet been fully integrated***") + self.name = "DRILL" + # TODO: Clear difference between training and testing should be defined at init + if refinement_operator is None: + refinement_operator = LengthBasedRefinement(knowledge_base=knowledge_base, + use_data_properties=use_data_properties, + use_card_restrictions=use_card_restrictions, + card_limit=card_limit, + use_inverse=use_inverse) + self.reward_func = reward_func + self.representation_mode = "averaging" + self.heuristic_func = None + self.num_workers = num_workers + self.epsilon = 1 + self.learning_rate = .001 + self.num_episode = 1 + self.num_of_sequential_actions = 3 + self.num_epochs_per_replay = 1 + self.max_len_replay_memory = 256 + self.epsilon_decay = 0.01 + self.epsilon_min = 0 + self.batch_size = 1024 + self.verbose = 0 + self.num_episodes_per_replay = 2 + self.seen_examples = dict() + self.emb_pos, self.emb_neg = None, None + self.start_time = None + self.goal_found = 
False + self.experiences = Experience(maxlen=self.max_len_replay_memory) + + if path_of_embeddings is not None and os.path.exists(path_of_embeddings): + self.instance_embeddings = pd.read_csv(path_of_embeddings) + self.embedding_dim = self.instance_embeddings.shape[1] + else: + self.instance_embeddings = None + self.embedding_dim = 12 + + self.sample_size = 1 + arg_net = {'input_shape': (4 * self.sample_size, self.embedding_dim), + 'first_out_channels': 32, 'second_out_channels': 16, 'third_out_channels': 8, + 'kernel_size': 3} + self.heuristic_func = DrillHeuristic(mode='averaging', model_args=arg_net) + if self.learning_rate: + self.optimizer = torch.optim.Adam(self.heuristic_func.net.parameters(), lr=self.learning_rate) + + if pretrained_model_name: + self.pre_trained_model_loaded = True + self.heuristic_func.net.load_state_dict(torch.load(pretrained_model_name, torch.device('cpu'))) + else: + self.pre_trained_model_loaded = False + + RefinementBasedConceptLearner.__init__(self, knowledge_base=knowledge_base, + refinement_operator=refinement_operator, + quality_func=quality_func, + heuristic_func=self.heuristic_func, + terminate_on_goal=terminate_on_goal, + iter_bound=iter_bound, + max_num_of_concepts_tested=max_num_of_concepts_tested, + max_runtime=max_runtime) + print('Number of parameters: ', sum([p.numel() for p in self.heuristic_func.net.parameters()])) + + self.search_tree = DRILLSearchTreePriorityQueue() + self._learning_problem = None + self.storage_path, _ = create_experiment_folder() + + def best_hypotheses(self, n=1): + assert self.search_tree is not None + assert len(self.search_tree) > 1 + if n == 1: + return [i for i in self.search_tree.get_top_n_nodes(n)][0] + else: + return [i for i in self.search_tree.get_top_n_nodes(n)] + + def clean(self): + self.emb_pos, self.emb_neg = None, None + self.goal_found = False + self.start_time = None + if len(self.search_tree) != 0: + self.search_tree.clean() + + try: + assert len(self.search_tree) == 0 + except AssertionError: + print(len(self.search_tree)) + raise AssertionError('EMPTY search tree') + + self._number_of_tested_concepts = 0 + + def downward_refinement(self, *args, **kwargs): + ValueError('downward_refinement') + + def next_node_to_expand(self) -> RL_State: + """ Return a node that maximizes the heuristic function at time t. """ + return self.search_tree.get_most_promising() + + def initialize_class_expression_learning_problem(self, pos: Set[OWLNamedIndividual], neg: Set[OWLNamedIndividual]): + """ + Determine the learning problem and initialize the search. + 1) Convert the string representation of an individuals into the owlready2 representation. + 2) Sample negative examples if necessary. + 3) Initialize the root and search tree. + """ + self.clean() + assert 0 < len(pos) and 0 < len(neg) + + # 1. + # Generate a Learning Problem + self._learning_problem = PosNegLPStandard(pos=set(pos), neg=set(neg)).encode_kb(self.kb) + # 2. Obtain embeddings of positive and negative examples. + if self.instance_embeddings is None: + self.emb_pos = None + self.emb_neg = None + else: + self.emb_pos = torch.tensor( + self.instance_embeddings.loc[[owl_indv.get_iri().as_str() for owl_indv in pos]].values, + dtype=torch.float32) + self.emb_neg = torch.tensor( + self.instance_embeddings.loc[[owl_indv.get_iri().as_str() for owl_indv in neg]].values, + dtype=torch.float32) + + # (3) Take the mean of positive and negative examples and reshape it into (1,1,embedding_dim) for mini batching. 
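+        # E.g., with k positive individuals and embedding dimension d, emb_pos goes from (k, d) to (1, 1, d); likewise for emb_neg.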
+ self.emb_pos = torch.mean(self.emb_pos, dim=0) + self.emb_pos = self.emb_pos.view(1, 1, self.emb_pos.shape[0]) + self.emb_neg = torch.mean(self.emb_neg, dim=0) + self.emb_neg = self.emb_neg.view(1, 1, self.emb_neg.shape[0]) + # Sanity checking + if torch.isnan(self.emb_pos).any() or torch.isinf(self.emb_pos).any(): + raise ValueError('invalid value detected in E+,\n{0}'.format(self.emb_pos)) + if torch.isnan(self.emb_neg).any() or torch.isinf(self.emb_neg).any(): + raise ValueError('invalid value detected in E-,\n{0}'.format(self.emb_neg)) + + # Initialize ROOT STATE + root_rl_state = self.create_rl_state(self.start_class, is_root=True) + self.compute_quality_of_class_expression(root_rl_state) + return root_rl_state + + def fit(self, lp: PosNegLPStandard, max_runtime=None): + if max_runtime: + assert isinstance(max_runtime, int) + self.max_runtime = max_runtime + # @TODO: Type injection should be possible for all + pos_type_counts = Counter( + [i for i in chain.from_iterable((self.kb.get_types(ind, direct=True) for ind in lp.pos))]) + neg_type_counts = Counter( + [i for i in chain.from_iterable((self.kb.get_types(ind, direct=True) for ind in lp.neg))]) + type_bias = pos_type_counts - neg_type_counts + + # (1) Initialize learning problem + root_state = self.initialize_class_expression_learning_problem(pos=lp.pos, neg=lp.neg) + # (2) Add root state into search tree + root_state.heuristic = root_state.quality + self.search_tree.add(root_state) + # (3) Inject Type Bias + for x in (self.create_rl_state(i, parent_node=root_state) for i in type_bias): + self.compute_quality_of_class_expression(x) + x.heuristic = x.quality + self.search_tree.add(x) + self.start_time = time.time() + # (3) Search + for i in range(1, self.iter_bound): + # (1) Get the most fitting RL-state + most_promising = self.next_node_to_expand() + next_possible_states = [] + # (2) Refine (1) + for ref in self.apply_refinement(most_promising): + # (2.1) If the next possible RL-state is not a dead end + # (2.1.) If the refinement of (1) is not equivalent to \bottom + if len(ref.instances): + # Compute quality + self.compute_quality_of_class_expression(ref) + if ref.quality == 0: + continue + next_possible_states.append(ref) + if ref.quality == 1.0: + break + try: + assert len(next_possible_states) > 0 + except AssertionError: + if self.verbose > 1: + logger.info(f'DEAD END at {most_promising}') + continue + if len(next_possible_states) == 0: + # We do not need to compute Q value based on embeddings of "zeros". + continue + + if self.pre_trained_model_loaded is True: + preds = self.predict_values(current_state=most_promising, next_states=next_possible_states) + else: + preds = None + self.goal_found = self.update_search(next_possible_states, preds) + if self.goal_found: + if self.terminate_on_goal: + return self.terminate() + if time.time() - self.start_time > self.max_runtime: + return self.terminate() + + def show_search_tree(self, heading_step: str, top_n: int = 10) -> None: + ValueError('show_search_tree') + + def terminate_training(self): + ValueError('terminate_training') + + def fit_from_iterable(self, + dataset: List[Tuple[object, Set[OWLNamedIndividual], Set[OWLNamedIndividual]]], + max_runtime: int = None) -> List: + """ + Dataset is a list of tuples where the first item is either str or OWL class expression indicating target + concept. 
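+        Each learning problem is solved independently; the F1-score and accuracy of the best hypothesis, the
+        number of tested concepts and the runtime are collected into a report.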
+ """ + if max_runtime: + self.max_runtime = max_runtime + renderer = DLSyntaxObjectRenderer() + + results = [] + for (target_ce, p, n) in dataset: + if self.verbose > 0: + logger.info(f'TARGET OWL CLASS EXPRESSION:\n{target_ce}') + logger.info(f'|Sampled Positive|:{len(p)}\t|Sampled Negative|:{len(n)}') + start_time = time.time() + self.fit(pos=p, neg=n, max_runtime=max_runtime) + rn = time.time() - start_time + h: RL_State = next(iter(self.best_hypotheses())) + # TODO:CD: We need to remove this first returned boolean for the sake of readability. + _, f_measure = F1().score_elp(instances=h.instances_bitset, learning_problem=self._learning_problem) + _, accuracy = Accuracy().score_elp(instances=h.instances_bitset, learning_problem=self._learning_problem) + + report = {'Target': str(target_ce), + 'Prediction': renderer.render(h.concept), + 'F-measure': f_measure, + 'Accuracy': accuracy, + 'NumClassTested': self._number_of_tested_concepts, + 'Runtime': rn} + results.append(report) + + return results + + def init_training(self, pos_uri: Set[OWLNamedIndividual], neg_uri: Set[OWLNamedIndividual]) -> None: + """ + Initialize training. + """ + """ (1) Generate a Learning Problem """ + self._learning_problem = PosNegLPStandard(pos=pos_uri, neg=neg_uri).encode_kb(self.kb) + """ (2) Update REWARD FUNC FOR each learning problem """ + self.reward_func.lp = self._learning_problem + """ (3) Obtain embeddings of positive and negative examples """ + self.emb_pos = torch.tensor( + self.instance_embeddings.loc[[owl_indv.get_iri().as_str() for owl_indv in pos_uri]].values, + dtype=torch.float32) + self.emb_neg = torch.tensor( + self.instance_embeddings.loc[[owl_indv.get_iri().as_str() for owl_indv in neg_uri]].values, + dtype=torch.float32) + """ (3) Take the mean of positive and negative examples and reshape it into (1,1,embedding_dim) for mini + batching """ + self.emb_pos = torch.mean(self.emb_pos, dim=0) + self.emb_pos = self.emb_pos.view(1, 1, self.emb_pos.shape[0]) + self.emb_neg = torch.mean(self.emb_neg, dim=0) + self.emb_neg = self.emb_neg.view(1, 1, self.emb_neg.shape[0]) + # Sanity checking + if torch.isnan(self.emb_pos).any() or torch.isinf(self.emb_pos).any(): + raise ValueError('invalid value detected in E+,\n{0}'.format(self.emb_pos)) + if torch.isnan(self.emb_neg).any() or torch.isinf(self.emb_neg).any(): + raise ValueError('invalid value detected in E-,\n{0}'.format(self.emb_neg)) + + # Default exploration exploitation tradeoff. + """ (3) Default exploration exploitation tradeoff and number of expression tested """ + self.epsilon = 1 + self._number_of_tested_concepts = 0 + + def create_rl_state(self, c: OWLClassExpression, parent_node: Optional[RL_State] = None, + is_root: bool = False) -> RL_State: + """ Create an RL_State instance.""" + # Create State + rl_state = RL_State(c, parent_node=parent_node, is_root=is_root) + # Assign Embeddings to it. Later, assign_embeddings can be also done in RL_STATE + self.assign_embeddings(rl_state) + rl_state.length = self.kb.concept_len(c) + return rl_state + + def compute_quality_of_class_expression(self, state: RL_State) -> None: + """ Compute Quality of owl class expression.""" + self.quality_func.apply(state, state.instances_bitset, self._learning_problem) + self._number_of_tested_concepts += 1 + + def apply_refinement(self, rl_state: RL_State) -> Generator: + """ + Refine an OWL Class expression \\|= Observing next possible states. + + 1. Generate concepts by refining a node. + 1.1. Compute allowed length of refinements. + 1.2. 
Convert concepts if concepts do not belong to self.concepts_to_ignore. + Note that i.str not in self.concepts_to_ignore => O(1) if a set is being used. + 3. Return Generator. + """ + assert isinstance(rl_state, RL_State) + # 1. + for i in self.operator.refine(rl_state.concept): # O(N) + yield self.create_rl_state(i, parent_node=rl_state) + + def learn_from_illustration(self, sequence_of_goal_path: List[RL_State]): + """ + Args: + sequence_of_goal_path: ⊤,Parent,Parent ⊓ Daughter. + """ + current_state = sequence_of_goal_path.pop(0) + rewards = [] + sequence_of_states = [] + while len(sequence_of_goal_path) > 0: + self.assign_embeddings(current_state) + current_state.length = self.kb.concept_len(current_state.concept) + if current_state.quality is None: + self.compute_quality_of_class_expression(current_state) + + next_state = sequence_of_goal_path.pop(0) + self.assign_embeddings(next_state) + next_state.length = self.kb.concept_len(next_state.concept) + if next_state.quality is None: + self.compute_quality_of_class_expression(next_state) + sequence_of_states.append((current_state, next_state)) + rewards.append(self.reward_func.apply(current_state, next_state)) + for x in range(2): + self.form_experiences(sequence_of_states, rewards) + self.learn_from_replay_memory() + + def rl_learning_loop(self, pos_uri: Set[OWLNamedIndividual], neg_uri: Set[OWLNamedIndividual], + goal_path: List[RL_State] = None) -> List[float]: + """ + Standard RL training loop. + + 1. Initialize RL environment for training. + + 2. Learn from an illustration if possible. + 2. Training Loop. + """ + """ (1) Initialize RL environment for training """ + self.init_training(pos_uri=pos_uri, neg_uri=neg_uri) + root_rl_state = self.create_rl_state(self.start_class, is_root=True) + self.compute_quality_of_class_expression(root_rl_state) + sum_of_rewards_per_actions = [] + log_every_n_episodes = int(self.num_episode * .1) + 1 + """ (2) Learn from an illustration if possible """ + if goal_path: + self.learn_from_illustration(goal_path) + + """ (3) Reinforcement Learning offline training loop """ + for th in range(self.num_episode): + """ (3.1) Sequence of decisions """ + sequence_of_states, rewards = self.sequence_of_actions(root_rl_state) + + if self.verbose >= 10: + logger.info('#' * 10, end='') + logger.info(f'{th}\t.th Sequence of Actions', end='') + logger.info('#' * 10) + for step, (current_state, next_state) in enumerate(sequence_of_states): + logger.info(f'{step}. Transition \n{current_state}\n----->\n{next_state}') + logger.info(f'Reward:{rewards[step]}') + + if th % log_every_n_episodes == 0: + if self.verbose >= 1: + logger.info('{0}.th iter. 
SumOfRewards: {1:.2f}\t' + 'Epsilon:{2:.2f}\t' + '|ReplayMem.|:{3}'.format(th, sum(rewards), + self.epsilon, + len(self.experiences))) + """(3.2) Form experiences""" + self.form_experiences(sequence_of_states, rewards) + sum_of_rewards_per_actions.append(sum(rewards)) + """(3.2) Learn from experiences""" + if th % self.num_episodes_per_replay == 0: + self.learn_from_replay_memory() + """(3.4) Exploration Exploitation""" + if self.epsilon < 0: + break + self.epsilon -= self.epsilon_decay + + return sum_of_rewards_per_actions + + def sequence_of_actions(self, root_rl_state: RL_State) -> Tuple[List[Tuple[AbstractNode, AbstractNode]], + List[SupportsFloat]]: + assert isinstance(root_rl_state, RL_State) + + current_state = root_rl_state + path_of_concepts = [] + rewards = [] + + assert len(current_state.embeddings) > 0 # Embeddings are initialized + assert current_state.quality > 0 + assert current_state.heuristic is None + + # (1) + for _ in range(self.num_of_sequential_actions): + assert isinstance(current_state, RL_State) + # (1.1) Observe Next RL states, i.e., refine an OWL class expression + next_rl_states = list(self.apply_refinement(current_state)) + # (1.2) + if len(next_rl_states) == 0: # DEAD END + # assert (current_state.length + 3) <= self.max_child_length + print('No next state') + break + # (1.3) + next_selected_rl_state = self.exploration_exploitation_tradeoff(current_state, next_rl_states) + # (1.4) Remember the concept path + path_of_concepts.append((current_state, next_selected_rl_state)) + # (1.5) + rewards.append(self.reward_func.apply(current_state, next_selected_rl_state)) + # (1.6) + current_state = next_selected_rl_state + return path_of_concepts, rewards + + def form_experiences(self, state_pairs: List, rewards: List) -> None: + """ + Form experiences from a sequence of concepts and corresponding rewards. + + state_pairs - A list of tuples containing two consecutive states. + reward - A list of reward. + + Gamma is 1. + + Return + X - A list of embeddings of current concept, next concept, positive examples, negative examples. + y - Argmax Q value. + """ + + if self.verbose > 1: + print('Form Experiences for the training') + + for th, consecutive_states in enumerate(state_pairs): + e, e_next = consecutive_states + self.experiences.append( + (e, e_next, max(rewards[th:]))) # given e, e_next, Q val is the max Q value reachable. + + def learn_from_replay_memory(self) -> None: + """ + Learning by replaying memory. 
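+
+        Each stored experience is a (state, next_state, max future reward) triple (cf. form_experiences); the
+        triples are batched together with the positive and negative example embeddings and the network is fitted
+        to the stored Q-values with its MSE loss.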
+ """ + if self.verbose > 1: + print('Learn from Experience') + + current_state_batch, next_state_batch, q_values = self.experiences.retrieve() + current_state_batch = torch.cat(current_state_batch, dim=0) + next_state_batch = torch.cat(next_state_batch, dim=0) + q_values = torch.Tensor(q_values) + + try: + assert current_state_batch.shape[1] == next_state_batch.shape[1] == self.emb_pos.shape[1] == \ + self.emb_neg.shape[1] + + except AssertionError as e: + print(current_state_batch.shape) + print(next_state_batch.shape) + print(self.emb_pos.shape) + print(self.emb_neg.shape) + print('Wrong format.') + print(e) + raise + + assert current_state_batch.shape[2] == next_state_batch.shape[2] == self.emb_pos.shape[2] == self.emb_neg.shape[ + 2] + dataset = PrepareBatchOfTraining(current_state_batch=current_state_batch, + next_state_batch=next_state_batch, + p=self.emb_pos, n=self.emb_neg, q=q_values) + num_experience = len(dataset) + data_loader = torch.utils.data.DataLoader(dataset, + batch_size=self.batch_size, shuffle=True, + num_workers=self.num_workers) + if self.verbose > 1: + print(f'Number of experiences:{num_experience}') + print('DQL agent is learning via experience replay') + self.heuristic_func.net.train() + for m in range(self.num_epochs_per_replay): + total_loss = 0 + for X, y in data_loader: + self.optimizer.zero_grad() # zero the gradient buffers + # forward + predicted_q = self.heuristic_func.net.forward(X) + # loss + loss = self.heuristic_func.net.loss(predicted_q, y) + total_loss += loss.item() + # compute the derivative of the loss w.r.t. the parameters using backpropagation + loss.backward() + # clip gradients if gradients are killed. =>torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5) + self.optimizer.step() + if self.verbose > 1: + print(f'{m}.th Epoch average loss during training:{total_loss / num_experience}') + + self.heuristic_func.net.train().eval() + + def update_search(self, concepts, predicted_Q_values=None): + """ + @param concepts: + @param predicted_Q_values: + @return: + """ + if predicted_Q_values is not None: + for child_node, pred_Q in zip(concepts, predicted_Q_values): + child_node.heuristic = pred_Q + if child_node.quality > 0: # > too weak, ignore. + self.search_tree.add(child_node) + if child_node.quality == 1: + return child_node + else: + for child_node in concepts: + child_node.heuristic = child_node.quality + if child_node.quality > 0: # > too weak, ignore. + self.search_tree.add(child_node) + if child_node.quality == 1: + return child_node + + def assign_embeddings(self, rl_state: RL_State) -> None: + """ + Assign embeddings to a rl state. A rl state is represented with vector representation of + all individuals belonging to a respective OWLClassExpression. + """ + assert isinstance(rl_state, RL_State) + # (1) Detect mode of representing OWLClassExpression + if self.representation_mode == 'averaging': + # (2) if input node has not seen before, assign embeddings. + if rl_state.embeddings is None: + assert isinstance(rl_state.concept, OWLClassExpression) + # (3) Retrieval instances via our retrieval function (R(C)). Be aware Open World and Closed World + # Assumption + rl_state.instances = set(self.kb.individuals(rl_state.concept)) + # (4) Retrieval instances in terms of bitset. + rl_state.instances_bitset = self.kb.individuals_set(rl_state.concept) + # (5) |R(C)|=\emptyset ? 
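+            # If so, C is represented below by a zero vector (or a random one when no instance embeddings are loaded).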
+ if len(rl_state.instances) == 0: + # If|R(C)|=\emptyset, then represent C with zeros + if self.instance_embeddings is not None: + emb = torch.zeros(1, self.sample_size, self.instance_embeddings.shape[1]) + else: + emb = torch.rand(size=(1, self.sample_size, self.embedding_dim)) + else: + # If|R(C)| \not= \emptyset, then take the mean of individuals. + str_idx = [i.get_iri().as_str() for i in rl_state.instances] + assert len(str_idx) > 0 + if self.instance_embeddings is not None: + emb = torch.tensor(self.instance_embeddings.loc[str_idx].values, dtype=torch.float32) + emb = torch.mean(emb, dim=0) + emb = emb.view(1, self.sample_size, self.instance_embeddings.shape[1]) + else: + emb = torch.rand(size=(1, self.sample_size, self.embedding_dim)) + # (6) Assign embeddings + rl_state.embeddings = emb + else: + """ Embeddings already assigned.""" + try: + assert rl_state.embeddings.shape == (1, self.sample_size, self.embedding_dim) + except AssertionError as e: + print(e) + print(rl_state) + print(rl_state.embeddings.shape) + print((1, self.sample_size, self.instance_embeddings.shape[1])) + raise + elif self.representation_mode == 'sampling': + raise NotImplementedError('Sampling technique for state representation is not implemented.') + """ + if node.embeddings is None: + str_idx = [get_full_iri(i).replace('\n', '') for i in node.concept.instances] + if len(str_idx) >= self.sample_size: + sampled_str_idx = random.sample(str_idx, self.sample_size) + emb = torch.tensor(self.instance_embeddings.loc[sampled_str_idx].values, dtype=torch.float32) + else: + num_rows_to_fill = self.sample_size - len(str_idx) + emb = torch.tensor(self.instance_embeddings.loc[str_idx].values, dtype=torch.float32) + emb = torch.cat((torch.zeros(num_rows_to_fill, self.instance_embeddings.shape[1]), emb)) + emb = emb.view(1, self.sample_size, self.instance_embeddings.shape[1]) + node.embeddings = emb + else: + try: + assert node.embeddings.shape == (1, self.sample_size, self.instance_embeddings.shape[1]) + except AssertionError: + print(node) + print(self.sample_size) + print(node.embeddings.shape) + print((1, self.sample_size, self.instance_embeddings.shape[1])) + raise ValueError + """ + else: + raise ValueError + + # @todo remove this testing in experiments. + if torch.isnan(rl_state.embeddings).any() or torch.isinf(rl_state.embeddings).any(): + # No individual contained in the input concept. + # Sanity checking. + raise ValueError + + def save_weights(self): + """ + Save pytorch weights. + """ + # Save model. + torch.save(self.heuristic_func.net.state_dict(), + self.storage_path + '/{0}.pth'.format(self.heuristic_func.name)) + + def exploration_exploitation_tradeoff(self, current_state: AbstractNode, + next_states: List[AbstractNode]) -> AbstractNode: + """ + Exploration vs Exploitation tradeoff at finding next state. + (1) Exploration. + (2) Exploitation. + """ + if np.random.random() < self.epsilon: + next_state = random.choice(next_states) + self.assign_embeddings(next_state) + else: + next_state = self.exploitation(current_state, next_states) + self.compute_quality_of_class_expression(next_state) + return next_state + + def exploitation(self, current_state: AbstractNode, next_states: List[AbstractNode]) -> AbstractNode: + """ + Find next node that is assigned with highest predicted Q value. + + (1) Predict Q values : predictions.shape => torch.Size([n, 1]) where n = len(next_states). + + (2) Find the index of max value in predictions. + + (3) Use the index to obtain next state. + + (4) Return next state. 
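+
+        Called only when exploration is not taken, i.e. with probability (1 - epsilon) inside
+        exploration_exploitation_tradeoff.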
+ """ + predictions: torch.Tensor = self.predict_values(current_state, next_states) + argmax_id = int(torch.argmax(predictions)) + next_state = next_states[argmax_id] + """ + # Sanity checking + print('#'*10) + for s, q in zip(next_states, predictions): + print(s, q) + print('#'*10) + print(next_state,f'\t {torch.max(predictions)}') + """ + return next_state + + def predict_values(self, current_state: AbstractNode, next_states: List[AbstractNode]) -> torch.Tensor: + """ + Predict promise of next states given current state. + + Returns: + Predicted Q values. + """ + # Instead it should be get embeddings ? + self.assign_embeddings(current_state) + assert len(next_states) > 0 + with torch.no_grad(): + self.heuristic_func.net.eval() + # create batch batch. + next_state_batch = [] + for _ in next_states: + self.assign_embeddings(_) + next_state_batch.append(_.embeddings) + next_state_batch = torch.cat(next_state_batch, dim=0) + x = PrepareBatchOfPrediction(current_state.embeddings, + next_state_batch, + self.emb_pos, + self.emb_neg).get_all() + predictions = self.heuristic_func.net.forward(x) + return predictions + + @staticmethod + def retrieve_concept_chain(rl_state: RL_State) -> List[RL_State]: + hierarchy = deque() + if rl_state.parent_node: + hierarchy.appendleft(rl_state.parent_node) + while hierarchy[-1].parent_node is not None: + hierarchy.append(hierarchy[-1].parent_node) + hierarchy.appendleft(rl_state) + return list(hierarchy) + + def train(self, dataset: Iterable[Tuple[str, Set, Set]], relearn_ratio: int = 2): + """ + Train RL agent on learning problems with relearn_ratio. + + Args: + dataset: An iterable containing training data. Each item corresponds to a tuple of string representation + of target concept, a set of positive examples in the form of URIs amd a set of negative examples in + the form of URIs, respectively. + relearn_ratio: An integer indicating the number of times dataset is iterated. + + Computation: + 1. Dataset and relearn_ratio loops: Learn each problem relearn_ratio times. + + 2. Learning loop. + + 3. Take post process action that implemented by subclass. + + Returns: + self. + """ + if self.verbose > 0: + logger.info(f'Training starts.\nNumber of learning problem:{len(dataset)},\t Relearn ratio:{relearn_ratio}') + counter = 1 + renderer = DLSyntaxObjectRenderer() + + # 1. + for _ in range(relearn_ratio): + for (target_owl_ce, positives, negatives) in dataset: + + if self.verbose > 0: + logger.info( + 'Goal Concept:{0}\tE^+:[{1}] \t E^-:[{2}]'.format(target_owl_ce, + len(positives), len(negatives))) + logger.info(f'RL training on {counter}.th learning problem starts') + + goal_path = list(reversed(self.retrieve_concept_chain(target_owl_ce))) + # goal_path: [⊤, Daughter, Daughter ⊓ Mother] + sum_of_rewards_per_actions = self.rl_learning_loop(pos_uri=positives, neg_uri=negatives, + goal_path=goal_path) + + if self.verbose > 2: + logger.info(f'Sum of Rewards in first 3 trajectory:{sum_of_rewards_per_actions[:3]}') + logger.info(f'Sum of Rewards in last 3 trajectory:{sum_of_rewards_per_actions[:3]}') + + self.seen_examples.setdefault(counter, dict()).update( + {'Concept': renderer.render(target_owl_ce.concept), + 'Positives': [i.get_iri().as_str() for i in positives], + 'Negatives': [i.get_iri().as_str() for i in negatives]}) + + counter += 1 + if counter % 100 == 0: + self.save_weights() + # 3. + return self.terminate_training() + + +class DrillHeuristic: + """ + Heuristic in Convolutional DQL concept learning. + Heuristic implements a convolutional neural network. 
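+    It wraps a DrillNet instance (see below); its score/apply methods only assign a fixed value to the root node
+    and raise for any other node.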
+    """
+
+    def __init__(self, pos=None, neg=None, model=None, mode=None, model_args=None):
+        if model:
+            self.net = model
+        elif mode in ['averaging', 'sampling']:
+            self.net = DrillNet(model_args)
+            self.mode = mode
+            self.name = 'DrillHeuristic_' + self.mode
+        else:
+            raise ValueError
+        self.net.eval()
+
+    def score(self, node, parent_node=None):
+        """ Compute heuristic value of root node only."""
+        if parent_node is None and node.is_root:
+            return torch.FloatTensor([.0001]).squeeze()
+        raise ValueError
+
+    def apply(self, node, parent_node=None):
+        """ Assign predicted Q-value to node object."""
+        predicted_q_val = self.score(node, parent_node)
+        node.heuristic = predicted_q_val
+
+
+class DrillNet(torch.nn.Module):
+    """
+    A neural model for Deep Q-Learning.
+
+    An input to Drill has the following form:
+        1. Indexes of individuals belonging to the current state (s).
+        2. Indexes of individuals belonging to the next state (s_prime).
+        3. Indexes of individuals provided as positive examples.
+        4. Indexes of individuals provided as negative examples.
+
+    Given such input, we form a sparse 3D tensor where each slice is an N-by-D matrix,
+    where N is the number of individuals and D is the dimensionality of the embeddings.
+    Given that N < 10^3 on the current benchmark datasets, we can get away with this computation; by doing so
+    we do not need to subsample from the given inputs.
+
+    """
+
+    def __init__(self, args):
+        super(DrillNet, self).__init__()
+        self.in_channels, self.embedding_dim = args['input_shape']
+        assert self.embedding_dim
+
+        self.loss = torch.nn.MSELoss()
+        # Conv1D seems to be faster than Conv2d
+        self.conv1 = torch.nn.Conv1d(in_channels=4,
+                               out_channels=args['first_out_channels'],
+                               kernel_size=args['kernel_size'],
+                               padding=1, stride=1, bias=True)
+
+        # Fully connected layers.
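+        # With kernel_size=3, padding=1 and stride=1, conv1 preserves the embedding length, so flattening yields
+        # first_out_channels * embedding_dim features per example.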
+ self.size_of_fc1 = int(args['first_out_channels'] * self.embedding_dim) + self.fc1 = torch.nn.Linear(in_features=self.size_of_fc1, out_features=self.size_of_fc1 // 2) + self.fc2 = torch.nn.Linear(in_features=self.size_of_fc1 // 2, out_features=1) + + self.init() + + def init(self): + torch.nn.init.xavier_normal_(self.fc1.weight.data) + torch.nn.init.xavier_normal_(self.conv1.weight.data) + + def forward(self, X: torch.FloatTensor): + """ + X n by 4 by d float tensor + """ + # N x 32 x D + X = F.relu(self.conv1(X)) + X = X.flatten(start_dim=1) + # N x (32D/2) + X = F.relu(self.fc1(X)) + # N x 1 + scores = self.fc2(X).flatten() + return scores + diff --git a/ontolearn/learners/nero.py b/ontolearn/learners/nero.py new file mode 100644 index 00000000..f491cc64 --- /dev/null +++ b/ontolearn/learners/nero.py @@ -0,0 +1,9 @@ +class NERO: + def __init__(self): + pass + + def train(self): + pass + + def fit(self): + pass From 10ffa91c9dc2f4215a57bcd2c30bb08a5bcd4406 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Tue, 5 Dec 2023 17:11:37 +0100 Subject: [PATCH 09/31] Version of ontolearn is incremented --- ontolearn/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ontolearn/__init__.py b/ontolearn/__init__.py index bfb401a7..3cac406a 100644 --- a/ontolearn/__init__.py +++ b/ontolearn/__init__.py @@ -7,7 +7,7 @@ Author: The Ontolearn team """ -__version__ = '0.6.1' +__version__ = '0.6.2' # TODO: Importing decision required rethinking # from .knowledge_base import KnowledgeBase diff --git a/setup.py b/setup.py index 81cf7685..1fe451d4 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="ontolearn", description="Ontolearn is an open-source software library for structured machine learning in Python. Ontolearn includes modules for processing knowledge bases, inductive logic programming and ontology engineering.", - version="0.6.1", + version="0.6.2", packages=find_packages(), install_requires=[ "scikit-learn>=0.24.1", From cb704a78a0365d2b0602f9af3d1717dd21d2074a Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Tue, 5 Dec 2023 19:38:50 +0100 Subject: [PATCH 10/31] Regression test between learners added, dicee integrated, ontolearn/learners added --- README.md | 2 +- examples/concept_learning_evaluation.py | 1 + ontolearn/learners/drill.py | 75 ++++++++++++++----------- requirements.txt | 3 +- setup.py | 3 +- tests/test_learners_regression.py | 49 ++++++++++++++++ 6 files changed, 98 insertions(+), 35 deletions(-) create mode 100644 tests/test_learners_regression.py diff --git a/README.md b/README.md index 4cc801e7..6a75ab71 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ pip install ontolearn ``` or ```shell -git clone https://github.com/dice-group/Ontolearn.git && conda create --name onto python=3.8.18 && conda activate onto +git clone https://github.com/dice-group/Ontolearn.git && conda create --name onto python=3.9.18 && conda activate onto pip3 install -e . 
&& python -c "import ontolearn" wget https://files.dice-research.org/projects/Ontolearn/KGs.zip -O ./KGs.zip && unzip KGs.zip ``` diff --git a/examples/concept_learning_evaluation.py b/examples/concept_learning_evaluation.py index 0b465ff3..cf5efadb 100644 --- a/examples/concept_learning_evaluation.py +++ b/examples/concept_learning_evaluation.py @@ -80,5 +80,6 @@ def dl_concept_learning(args): parser.add_argument("--max_runtime", type=int, default=10) parser.add_argument("--lps", type=str, default="synthetic_problems.json") parser.add_argument("--kb", type=str, default="../KGs/Family/family-benchmark_rich_background.owl") + parser.add_argument("--path_pretrained_kge", type=str, default="../KeciFamilyRun") dl_concept_learning(parser.parse_args()) diff --git a/ontolearn/learners/drill.py b/ontolearn/learners/drill.py index 2aab9533..9087d516 100644 --- a/ontolearn/learners/drill.py +++ b/ontolearn/learners/drill.py @@ -12,12 +12,23 @@ from collections import Counter from itertools import chain import time +import dicee +import os + class Drill(RefinementBasedConceptLearner): - """Deep Reinforcement Learning for Refinement Operators in ALC.""" + """ Neuro-Symbolic Class Expression Learning (https://www.ijcai.org/proceedings/2023/0403.pdf) + dice embeddings ? + pip3 install dicee + dicee --path_single_kg KGs/Family/family-benchmark_rich_background.owl --backend rdflib --model Keci --embedding_dim 32 --num_epochs 100 --path_to_store_single_run KeciFamilyRun + + + """ def __init__(self, knowledge_base, - path_of_embeddings: str = None, refinement_operator: LengthBasedRefinement = None, + path_pretrained_kge: str = None, + path_pretrained_drill: str = None, + refinement_operator: LengthBasedRefinement = None, use_inverse=True, use_data_properties=True, use_card_restrictions=True, @@ -32,7 +43,18 @@ def __init__(self, knowledge_base, print("***DRILL has not yet been fully integrated***") self.name = "DRILL" - # TODO: Clear difference between training and testing should be defined at init + if path_pretrained_kge is not None and os.path.isdir(path_pretrained_kge): + self.pre_trained_kge = dicee.KGE(path=path_pretrained_kge) + self.embedding_dim = self.pre_trained_kge.configs["embedding_dim"] + else: + self.pre_trained_kge = None + self.embedding_dim = 32 + + if path_pretrained_drill is not None and os.path.isdir(path_pretrained_drill): + raise NotImplementedError() + else: + self.pre_trained_drill = None + if refinement_operator is None: refinement_operator = LengthBasedRefinement(knowledge_base=knowledge_base, use_data_properties=use_data_properties, @@ -60,18 +82,12 @@ def __init__(self, knowledge_base, self.goal_found = False self.experiences = Experience(maxlen=self.max_len_replay_memory) - if path_of_embeddings is not None and os.path.exists(path_of_embeddings): - self.instance_embeddings = pd.read_csv(path_of_embeddings) - self.embedding_dim = self.instance_embeddings.shape[1] - else: - self.instance_embeddings = None - self.embedding_dim = 12 - self.sample_size = 1 - arg_net = {'input_shape': (4 * self.sample_size, self.embedding_dim), - 'first_out_channels': 32, 'second_out_channels': 16, 'third_out_channels': 8, - 'kernel_size': 3} - self.heuristic_func = DrillHeuristic(mode='averaging', model_args=arg_net) + self.heuristic_func = DrillHeuristic(mode=self.representation_mode, + model_args={'input_shape': (4 * self.sample_size, self.embedding_dim), + 'first_out_channels': 32, + 'second_out_channels': 16, 'third_out_channels': 8, + 'kernel_size': 3}) if self.learning_rate: self.optimizer = 
torch.optim.Adam(self.heuristic_func.net.parameters(), lr=self.learning_rate) @@ -139,16 +155,12 @@ def initialize_class_expression_learning_problem(self, pos: Set[OWLNamedIndividu # Generate a Learning Problem self._learning_problem = PosNegLPStandard(pos=set(pos), neg=set(neg)).encode_kb(self.kb) # 2. Obtain embeddings of positive and negative examples. - if self.instance_embeddings is None: + if self.pre_trained_kge is None: self.emb_pos = None self.emb_neg = None else: - self.emb_pos = torch.tensor( - self.instance_embeddings.loc[[owl_indv.get_iri().as_str() for owl_indv in pos]].values, - dtype=torch.float32) - self.emb_neg = torch.tensor( - self.instance_embeddings.loc[[owl_indv.get_iri().as_str() for owl_indv in neg]].values, - dtype=torch.float32) + self.emb_pos = self.pre_trained_kge.get_entity_embeddings([owl_indv.get_iri().as_str() for owl_indv in pos]) + self.emb_neg = self.pre_trained_kge.get_entity_embeddings([owl_indv.get_iri().as_str() for owl_indv in neg]) # (3) Take the mean of positive and negative examples and reshape it into (1,1,embedding_dim) for mini batching. self.emb_pos = torch.mean(self.emb_pos, dim=0) @@ -556,18 +568,18 @@ def assign_embeddings(self, rl_state: RL_State) -> None: # (5) |R(C)|=\emptyset ? if len(rl_state.instances) == 0: # If|R(C)|=\emptyset, then represent C with zeros - if self.instance_embeddings is not None: - emb = torch.zeros(1, self.sample_size, self.instance_embeddings.shape[1]) + if self.pre_trained_kge is not None: + emb = torch.zeros(1, self.sample_size, self.embedding_dim) else: emb = torch.rand(size=(1, self.sample_size, self.embedding_dim)) else: # If|R(C)| \not= \emptyset, then take the mean of individuals. - str_idx = [i.get_iri().as_str() for i in rl_state.instances] - assert len(str_idx) > 0 - if self.instance_embeddings is not None: - emb = torch.tensor(self.instance_embeddings.loc[str_idx].values, dtype=torch.float32) + str_individuals = [i.get_iri().as_str() for i in rl_state.instances] + assert len(str_individuals) > 0 + if self.pre_trained_kge is not None: + emb = self.pre_trained_kge.get_entity_embeddings(str_individuals) emb = torch.mean(emb, dim=0) - emb = emb.view(1, self.sample_size, self.instance_embeddings.shape[1]) + emb = emb.view(1, self.sample_size, self.embedding_dim) else: emb = torch.rand(size=(1, self.sample_size, self.embedding_dim)) # (6) Assign embeddings @@ -808,9 +820,9 @@ def __init__(self, args): self.loss = torch.nn.MSELoss() # Conv1D seems to be faster than Conv2d self.conv1 = torch.nn.Conv1d(in_channels=4, - out_channels=args['first_out_channels'], - kernel_size=args['kernel_size'], - padding=1, stride=1, bias=True) + out_channels=args['first_out_channels'], + kernel_size=args['kernel_size'], + padding=1, stride=1, bias=True) # Fully connected layers. 
self.size_of_fc1 = int(args['first_out_channels'] * self.embedding_dim) @@ -835,4 +847,3 @@ def forward(self, X: torch.FloatTensor): # N x 1 scores = self.fc2(X).flatten() return scores - diff --git a/requirements.txt b/requirements.txt index 270a9325..d1d7a1e3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,5 @@ tqdm>=4.64.0 tokenizers>=0.12.1 transformers>=4.19.2 requests>=2.31.0 -owlapy>=0.1.0 \ No newline at end of file +owlapy>=0.1.0 +dicee>=0.1.2 \ No newline at end of file diff --git a/setup.py b/setup.py index 1fe451d4..8a8349d8 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,8 @@ "tokenizers>=0.12.1", "transformers>=4.19.2", "pytest>=7.2.2", - "owlapy>=0.1.0"], + "owlapy>=0.1.0", + "dicee>=0.1.2"], author='Caglar Demir', author_email='caglardemir8@gmail.com', url='https://github.com/dice-group/Ontolearn', diff --git a/tests/test_learners_regression.py b/tests/test_learners_regression.py new file mode 100644 index 00000000..fe259563 --- /dev/null +++ b/tests/test_learners_regression.py @@ -0,0 +1,49 @@ +import json +import random +import unittest +from ontolearn.learning_problem import PosNegLPStandard +from owlapy.model import OWLNamedIndividual, IRI + +from ontolearn.knowledge_base import KnowledgeBase +from ontolearn.concept_learner import EvoLearner, CELOE, OCEL +from ontolearn.learners import Drill +from ontolearn.metrics import F1 + +import os +import time +from owlapy.model import OWLNamedIndividual, IRI + + +class TestConceptLearnerReg: + + def test_regression_family(self): + with open('examples/synthetic_problems.json') as json_file: + settings = json.load(json_file) + kb = KnowledgeBase(path=settings['data_path'][3:]) + max_runtime=10 + + ocel = OCEL(knowledge_base=kb, quality_func=F1(), max_runtime=max_runtime) + celoe = CELOE(knowledge_base=kb, quality_func=F1(), max_runtime=max_runtime) + evo = EvoLearner(knowledge_base=kb, quality_func=F1(), max_runtime=max_runtime) + drill = Drill(knowledge_base=kb, quality_func=F1(), max_runtime=max_runtime) + + drill_quality=[] + celoe_quality=[] + ocel_quality=[] + evo_quality=[] + + for str_target_concept, examples in settings['problems'].items(): + pos = set(map(OWLNamedIndividual, map(IRI.create, set(examples['positive_examples'])))) + neg = set(map(OWLNamedIndividual, map(IRI.create, set(examples['negative_examples'])))) + print('Target concept: ', str_target_concept) + + lp = PosNegLPStandard(pos=pos, neg=neg) + # Untrained & max runtime is not fully integrated. 
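+            # Only the F1 quality of each learner's single best hypothesis is recorded per learning problem.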
+ ocel_quality.append(ocel.fit(lp).best_hypotheses(n=1).quality) + celoe_quality.append(celoe.fit(lp).best_hypotheses(n=1).quality) + evo_quality.append(evo.fit(lp).best_hypotheses(n=1).quality) + drill_quality.append(drill.fit(lp).best_hypotheses(n=1).quality) + + + assert sum(evo_quality)>sum(drill_quality)>sum(celoe_quality)>sum(ocel_quality) + From f5c41dd1827980288101fe571c25f953b4ec4493 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Tue, 5 Dec 2023 19:44:40 +0100 Subject: [PATCH 11/31] Fix dicee import error --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 36452d82..01f5a4e7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,6 +33,7 @@ install_requires = tqdm>=4.64.0 tokenizers>=0.12.1 transformers>=4.19.2 + dicee>=0.1.2 [options.extras_require] test = From 3b1247e2d67607f05f2d9e8b5cc067997b19b33d Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Tue, 5 Dec 2023 19:50:42 +0100 Subject: [PATCH 12/31] Fix dicee import error --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 8f533a75..4989e7aa 100644 --- a/environment.yml +++ b/environment.yml @@ -17,6 +17,7 @@ dependencies: - httpx=0.21.1 - parsimonious=0.8.1 - tqdm=4.64.0 + - dicee==0.1.2 # testing - tox=3.24.3 - tox-conda=0.8.3 From 2dae58074f9216f60d21e2ed2d9c2bac2407f9f6 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Tue, 5 Dec 2023 19:52:23 +0100 Subject: [PATCH 13/31] Fix dicee import error --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 4989e7aa..0e79b1db 100644 --- a/environment.yml +++ b/environment.yml @@ -17,7 +17,6 @@ dependencies: - httpx=0.21.1 - parsimonious=0.8.1 - tqdm=4.64.0 - - dicee==0.1.2 # testing - tox=3.24.3 - tox-conda=0.8.3 @@ -34,6 +33,7 @@ dependencies: # PIP - pip=21.0.1 - pip: + - dicee==0.1.2 - owlapy == 0.1.0 - tokenizers==0.12.1 - transformers==4.19.2 From 0498ca74ed683e6d7be143596283354171809010 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Tue, 5 Dec 2023 19:59:03 +0100 Subject: [PATCH 14/31] conda removed from testing --- .github/workflows/test.yml | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c9aefeb3..85607ffd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.9"] + python-version: ["3.9.18"] max-parallel: 5 steps: - uses: actions/checkout@v3 @@ -15,14 +15,18 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - - name: Set up Conda - uses: conda-incubator/setup-miniconda@v2 - with: - environment-file: environment.yml + # - name: Set up Conda + # uses: conda-incubator/setup-miniconda@v2 + # with: + # environment-file: environment.yml + # conda run -n test pytest -p no:warnings -x + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt - name: Test with pytest run: | wget https://files.dice-research.org/projects/Ontolearn/KGs.zip unzip KGs.zip - conda run -n test pytest -p no:warnings -x \ No newline at end of file + pytest -p no:warnings -x \ No newline at end of file From eecabc6f1821c47b1cc3c7efbfd2c7694d9d38d9 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Tue, 5 Dec 2023 20:13:53 +0100 Subject: [PATCH 15/31] Potential fix for CondaEnvException: Pip failed --- README.md | 4 ++-- environment.yml | 2 +- 
requirements.txt | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 6a75ab71..9453917b 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,8 @@ pip install ontolearn ``` or ```shell -git clone https://github.com/dice-group/Ontolearn.git && conda create --name onto python=3.9.18 && conda activate onto -pip3 install -e . && python -c "import ontolearn" +git clone https://github.com/dice-group/Ontolearn.git +conda create --name onto python=3.9.18 && conda activate onto && pip3 install -e . && python -c "import ontolearn" wget https://files.dice-research.org/projects/Ontolearn/KGs.zip -O ./KGs.zip && unzip KGs.zip ``` ```shell diff --git a/environment.yml b/environment.yml index 0e79b1db..e4853488 100644 --- a/environment.yml +++ b/environment.yml @@ -33,7 +33,7 @@ dependencies: # PIP - pip=21.0.1 - pip: - - dicee==0.1.2 + - dicee == 0.1.2 - owlapy == 0.1.0 - tokenizers==0.12.1 - transformers==4.19.2 diff --git a/requirements.txt b/requirements.txt index d1d7a1e3..33365eeb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,5 @@ tokenizers>=0.12.1 transformers>=4.19.2 requests>=2.31.0 owlapy>=0.1.0 -dicee>=0.1.2 \ No newline at end of file +dicee>=0.1.2 +pytest>=7.2.2 \ No newline at end of file From 112814949f2cfd982fc0980b2a35b613ead7dc26 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Tue, 5 Dec 2023 20:20:55 +0100 Subject: [PATCH 16/31] dicee is removed --- environment.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/environment.yml b/environment.yml index e4853488..8f533a75 100644 --- a/environment.yml +++ b/environment.yml @@ -33,7 +33,6 @@ dependencies: # PIP - pip=21.0.1 - pip: - - dicee == 0.1.2 - owlapy == 0.1.0 - tokenizers==0.12.1 - transformers==4.19.2 From fb28d9491541441de473fb6961a5cd5903ba92f4 Mon Sep 17 00:00:00 2001 From: Alkid Date: Wed, 6 Dec 2023 14:39:46 +0100 Subject: [PATCH 17/31] Fixed dependencies --- environment.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 8f533a75..8bf41366 100644 --- a/environment.yml +++ b/environment.yml @@ -11,7 +11,6 @@ dependencies: - rdflib=6.0.2 - scikit-learn=1.0.2 - sortedcontainers=2.4.0 - - owlready2=0.40 - flask=1.1.2 - deap=1.3.1 - httpx=0.21.1 @@ -33,7 +32,8 @@ dependencies: # PIP - pip=21.0.1 - pip: - - owlapy == 0.1.0 + - owlready2==0.40 + - owlapy==0.1.0 - tokenizers==0.12.1 - transformers==4.19.2 # testing From 319d939c88304a9b26667056d5a19df1e27ebe38 Mon Sep 17 00:00:00 2001 From: Alkid Date: Wed, 6 Dec 2023 15:07:58 +0100 Subject: [PATCH 18/31] Fixed output printing for nces --- ontolearn/model_adapter.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ontolearn/model_adapter.py b/ontolearn/model_adapter.py index c9100336..2f800e0a 100644 --- a/ontolearn/model_adapter.py +++ b/ontolearn/model_adapter.py @@ -388,8 +388,7 @@ def execute(args): print(list(trainer.best_hypotheses(1)).pop()) elif args.model in ["nces"]: - hypothesis = model.fit(pos, neg) - report = "Prediction: " + DLSyntaxObjectRenderer().render(hypothesis) + "Quality: " + \ - compute_quality(kb, hypothesis, pos, neg, args.quality_metric) + "Individuals: " + \ - kb.individuals_count(hypothesis) + hypothesis = model.fit(pos, neg) # This will also print the prediction + report = f"Quality: {compute_quality(kb, hypothesis, pos, neg, args.quality_metric)} \nIndividuals: " + \ + f"{kb.individuals_count(hypothesis)}" print(report) From 65aacbd8563b4619a56fc3d617207066ba6fbe33 Mon Sep 17 00:00:00 2001 From: Caglar 
Demir Date: Wed, 6 Dec 2023 20:31:55 +0100 Subject: [PATCH 19/31] WIP:DRILL training is available --- examples/concept_learning_evaluation.py | 20 +- ontolearn/data_struct.py | 5 +- ontolearn/learners/drill.py | 311 +++++++++++++++--------- ontolearn/refinement_operators.py | 11 +- 4 files changed, 202 insertions(+), 145 deletions(-) diff --git a/examples/concept_learning_evaluation.py b/examples/concept_learning_evaluation.py index cf5efadb..db368924 100644 --- a/examples/concept_learning_evaluation.py +++ b/examples/concept_learning_evaluation.py @@ -14,20 +14,15 @@ def dl_concept_learning(args): - try: - os.chdir("examples") - except FileNotFoundError: - pass - with open(args.lps) as json_file: settings = json.load(json_file) kb = KnowledgeBase(path=args.kb) - - ocel=OCEL(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) - celoe=CELOE(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) - evo=EvoLearner(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) - drill=Drill(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) + drill = Drill(knowledge_base=kb, path_pretrained_kge=args.path_pretrained_kge, quality_func=F1(), + max_runtime=args.max_runtime).train(num_episode=1, num_learning_problems=1) + ocel = OCEL(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) + celoe = CELOE(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) + evo = EvoLearner(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) columns = ["LP", "OCEL", "F1-OCEL", "RT-OCEL", "CELOE", "F1-CELOE", "RT-CELOE", @@ -45,11 +40,9 @@ def dl_concept_learning(args): lp = PosNegLPStandard(pos=typed_pos, neg=typed_neg) start_time = time.time() - # Untrained & max runtime is not fully integrated. 
pred_drill = drill.fit(lp).best_hypotheses(n=1) rt_drill = time.time() - start_time - start_time = time.time() pred_ocel = ocel.fit(lp).best_hypotheses(n=1) rt_ocel = time.time() - start_time @@ -80,6 +73,5 @@ def dl_concept_learning(args): parser.add_argument("--max_runtime", type=int, default=10) parser.add_argument("--lps", type=str, default="synthetic_problems.json") parser.add_argument("--kb", type=str, default="../KGs/Family/family-benchmark_rich_background.owl") - parser.add_argument("--path_pretrained_kge", type=str, default="../KeciFamilyRun") - + parser.add_argument("--path_pretrained_kge", type=str, default=None) dl_concept_learning(parser.parse_args()) diff --git a/ontolearn/data_struct.py b/ontolearn/data_struct.py index 31f82746..f956794d 100644 --- a/ontolearn/data_struct.py +++ b/ontolearn/data_struct.py @@ -12,8 +12,6 @@ def __init__(self, current_state: torch.FloatTensor, next_state_batch: torch.Flo n: torch.FloatTensor): assert len(p) > 0 and len(n) > 0 num_next_states = len(next_state_batch) - - current_state = current_state.repeat(num_next_states, 1, 1) p = p.repeat((num_next_states, 1, 1)) n = n.repeat((num_next_states, 1, 1)) @@ -63,8 +61,9 @@ def __init__(self, current_state_batch: torch.Tensor, next_state_batch: torch.Te assert self.S.shape == self.S_Prime.shape == self.Positives.shape == self.Negatives.shape assert self.S.dtype == self.S_Prime.dtype == self.Positives.dtype == self.Negatives.dtype == torch.float32 self.X = torch.cat([self.S, self.S_Prime, self.Positives, self.Negatives], 1) + num_points, depth, dim = self.X.shape - self.X = self.X.view(num_points, depth, 1, dim) + # self.X = self.X.view(num_points, depth, 1, dim) # X[0] => corresponds to a data point, X[0] \in R^{4 \times 1 \times dim} # where X[0][0] => current state representation R^{1 \times dim} # where X[0][1] => next state representation R^{1 \times dim} diff --git a/ontolearn/learners/drill.py b/ontolearn/learners/drill.py index 9087d516..21f10ff3 100644 --- a/ontolearn/learners/drill.py +++ b/ontolearn/learners/drill.py @@ -9,11 +9,17 @@ from ontolearn.data_struct import Experience from ontolearn.search import DRILLSearchTreePriorityQueue from ontolearn.utils import create_experiment_folder -from collections import Counter +from collections import Counter, deque from itertools import chain import time import dicee import os +from owlapy.render import DLSyntaxObjectRenderer +from ontolearn.metrics import F1 +import random +from ontolearn.heuristics import Reward +import torch +from ontolearn.data_struct import PrepareBatchOfTraining, PrepareBatchOfPrediction class Drill(RefinementBasedConceptLearner): @@ -34,21 +40,24 @@ def __init__(self, knowledge_base, use_card_restrictions=True, card_limit=10, quality_func: AbstractScorer = None, - reward_func=None, - batch_size=None, num_workers=None, pretrained_model_name=None, - iter_bound=None, max_num_of_concepts_tested=None, verbose=None, terminate_on_goal=None, - max_len_replay_memory=None, epsilon_decay=None, epsilon_min=None, num_epochs_per_replay=None, - num_episodes_per_replay=None, learning_rate=None, max_runtime=None, num_of_sequential_actions=None, - num_episode=None): - - print("***DRILL has not yet been fully integrated***") + reward_func: object = None, + batch_size=None, num_workers: int = 1, pretrained_model_name=None, + iter_bound=None, max_num_of_concepts_tested=None, verbose: int = 0, terminate_on_goal=None, + max_len_replay_memory=256, + epsilon_decay: float = 0.01, epsilon_min: float = 0.0, + num_epochs_per_replay: int = 100, + 
num_episodes_per_replay: int = 2, learning_rate: float = 0.001, + max_runtime=None, + num_of_sequential_actions=3, + num_episode=10): + self.name = "DRILL" if path_pretrained_kge is not None and os.path.isdir(path_pretrained_kge): self.pre_trained_kge = dicee.KGE(path=path_pretrained_kge) self.embedding_dim = self.pre_trained_kge.configs["embedding_dim"] else: self.pre_trained_kge = None - self.embedding_dim = 32 + self.embedding_dim = 12 if path_pretrained_drill is not None and os.path.isdir(path_pretrained_drill): raise NotImplementedError() @@ -61,33 +70,40 @@ def __init__(self, knowledge_base, use_card_restrictions=use_card_restrictions, card_limit=card_limit, use_inverse=use_inverse) - self.reward_func = reward_func + else: + refinement_operator = refinement_operator + if reward_func is None: + self.reward_func = Reward() + else: + self.reward_func = reward_func + self.representation_mode = "averaging" - self.heuristic_func = None + self.sample_size = 1 + self.heuristic_func = DrillHeuristic(mode=self.representation_mode, + model_args={'input_shape': (4 * self.sample_size, self.embedding_dim), + 'first_out_channels': 32, + 'second_out_channels': 16, 'third_out_channels': 8, + 'kernel_size': 3}) + self.num_workers = num_workers - self.epsilon = 1 - self.learning_rate = .001 - self.num_episode = 1 - self.num_of_sequential_actions = 3 - self.num_epochs_per_replay = 1 - self.max_len_replay_memory = 256 - self.epsilon_decay = 0.01 - self.epsilon_min = 0 - self.batch_size = 1024 - self.verbose = 0 - self.num_episodes_per_replay = 2 + self.learning_rate = learning_rate + self.num_episode = num_episode + self.num_of_sequential_actions = num_of_sequential_actions + self.num_epochs_per_replay = num_epochs_per_replay + self.max_len_replay_memory = max_len_replay_memory + self.epsilon_decay = epsilon_decay + self.epsilon_min = epsilon_min + self.batch_size = batch_size + self.verbose = verbose + self.num_episodes_per_replay = num_episodes_per_replay self.seen_examples = dict() self.emb_pos, self.emb_neg = None, None self.start_time = None self.goal_found = False self.experiences = Experience(maxlen=self.max_len_replay_memory) - self.sample_size = 1 - self.heuristic_func = DrillHeuristic(mode=self.representation_mode, - model_args={'input_shape': (4 * self.sample_size, self.embedding_dim), - 'first_out_channels': 32, - 'second_out_channels': 16, 'third_out_channels': 8, - 'kernel_size': 3}) + self.epsilon = 1 + if self.learning_rate: self.optimizer = torch.optim.Adam(self.heuristic_func.net.parameters(), lr=self.learning_rate) @@ -105,11 +121,12 @@ def __init__(self, knowledge_base, iter_bound=iter_bound, max_num_of_concepts_tested=max_num_of_concepts_tested, max_runtime=max_runtime) - print('Number of parameters: ', sum([p.numel() for p in self.heuristic_func.net.parameters()])) - self.search_tree = DRILLSearchTreePriorityQueue() - self._learning_problem = None self.storage_path, _ = create_experiment_folder() + self._learning_problem = None + self.renderer = DLSyntaxObjectRenderer() + + self.operator: RefinementBasedConceptLearner def best_hypotheses(self, n=1): assert self.search_tree is not None @@ -220,8 +237,7 @@ def fit(self, lp: PosNegLPStandard, max_runtime=None): try: assert len(next_possible_states) > 0 except AssertionError: - if self.verbose > 1: - logger.info(f'DEAD END at {most_promising}') + print(f'DEAD END at {most_promising}') continue if len(next_possible_states) == 0: # We do not need to compute Q value based on embeddings of "zeros". 
@@ -239,10 +255,10 @@ def fit(self, lp: PosNegLPStandard, max_runtime=None): return self.terminate() def show_search_tree(self, heading_step: str, top_n: int = 10) -> None: - ValueError('show_search_tree') + assert ValueError('show_search_tree') def terminate_training(self): - ValueError('terminate_training') + return self def fit_from_iterable(self, dataset: List[Tuple[object, Set[OWLNamedIndividual], Set[OWLNamedIndividual]]], @@ -257,9 +273,8 @@ def fit_from_iterable(self, results = [] for (target_ce, p, n) in dataset: - if self.verbose > 0: - logger.info(f'TARGET OWL CLASS EXPRESSION:\n{target_ce}') - logger.info(f'|Sampled Positive|:{len(p)}\t|Sampled Negative|:{len(n)}') + print(f'TARGET OWL CLASS EXPRESSION:\n{target_ce}') + print(f'|Sampled Positive|:{len(p)}\t|Sampled Negative|:{len(n)}') start_time = time.time() self.fit(pos=p, neg=n, max_runtime=max_runtime) rn = time.time() - start_time @@ -287,23 +302,25 @@ def init_training(self, pos_uri: Set[OWLNamedIndividual], neg_uri: Set[OWLNamedI """ (2) Update REWARD FUNC FOR each learning problem """ self.reward_func.lp = self._learning_problem """ (3) Obtain embeddings of positive and negative examples """ - self.emb_pos = torch.tensor( - self.instance_embeddings.loc[[owl_indv.get_iri().as_str() for owl_indv in pos_uri]].values, - dtype=torch.float32) - self.emb_neg = torch.tensor( - self.instance_embeddings.loc[[owl_indv.get_iri().as_str() for owl_indv in neg_uri]].values, - dtype=torch.float32) - """ (3) Take the mean of positive and negative examples and reshape it into (1,1,embedding_dim) for mini - batching """ - self.emb_pos = torch.mean(self.emb_pos, dim=0) - self.emb_pos = self.emb_pos.view(1, 1, self.emb_pos.shape[0]) - self.emb_neg = torch.mean(self.emb_neg, dim=0) - self.emb_neg = self.emb_neg.view(1, 1, self.emb_neg.shape[0]) - # Sanity checking - if torch.isnan(self.emb_pos).any() or torch.isinf(self.emb_pos).any(): - raise ValueError('invalid value detected in E+,\n{0}'.format(self.emb_pos)) - if torch.isnan(self.emb_neg).any() or torch.isinf(self.emb_neg).any(): - raise ValueError('invalid value detected in E-,\n{0}'.format(self.emb_neg)) + if self.pre_trained_kge is not None: + self.emb_pos = self.pre_trained_kge.get_entity_embeddings( + [owl_individual.get_iri().as_str() for owl_individual in pos_uri]) + self.emb_neg = self.pre_trained_kge.get_entity_embeddings( + [owl_individual.get_iri().as_str() for owl_individual in neg_uri]) + """ (3) Take the mean of positive and negative examples and reshape it into (1,1,embedding_dim) for mini + batching """ + self.emb_pos = torch.mean(self.emb_pos, dim=0) + self.emb_pos = self.emb_pos.view(1, 1, self.emb_pos.shape[0]) + self.emb_neg = torch.mean(self.emb_neg, dim=0) + self.emb_neg = self.emb_neg.view(1, 1, self.emb_neg.shape[0]) + # Sanity checking + if torch.isnan(self.emb_pos).any() or torch.isinf(self.emb_pos).any(): + raise ValueError('invalid value detected in E+,\n{0}'.format(self.emb_pos)) + if torch.isnan(self.emb_neg).any() or torch.isinf(self.emb_neg).any(): + raise ValueError('invalid value detected in E-,\n{0}'.format(self.emb_neg)) + else: + self.emb_pos = None + self.emb_neg = None # Default exploration exploitation tradeoff. """ (3) Default exploration exploitation tradeoff and number of expression tested """ @@ -336,6 +353,7 @@ def apply_refinement(self, rl_state: RL_State) -> Generator: 3. Return Generator. """ assert isinstance(rl_state, RL_State) + self.operator: LengthBasedRefinement # 1. 
for i in self.operator.refine(rl_state.concept): # O(N) yield self.create_rl_state(i, parent_node=rl_state) @@ -365,7 +383,7 @@ def learn_from_illustration(self, sequence_of_goal_path: List[RL_State]): self.form_experiences(sequence_of_states, rewards) self.learn_from_replay_memory() - def rl_learning_loop(self, pos_uri: Set[OWLNamedIndividual], neg_uri: Set[OWLNamedIndividual], + def rl_learning_loop(self, num_episode: int, pos_uri: Set[OWLNamedIndividual], neg_uri: Set[OWLNamedIndividual], goal_path: List[RL_State] = None) -> List[float]: """ Standard RL training loop. @@ -376,41 +394,41 @@ def rl_learning_loop(self, pos_uri: Set[OWLNamedIndividual], neg_uri: Set[OWLNam 2. Training Loop. """ """ (1) Initialize RL environment for training """ + assert isinstance(pos_uri, Set) and isinstance(neg_uri, Set) self.init_training(pos_uri=pos_uri, neg_uri=neg_uri) root_rl_state = self.create_rl_state(self.start_class, is_root=True) self.compute_quality_of_class_expression(root_rl_state) sum_of_rewards_per_actions = [] - log_every_n_episodes = int(self.num_episode * .1) + 1 """ (2) Learn from an illustration if possible """ if goal_path: self.learn_from_illustration(goal_path) """ (3) Reinforcement Learning offline training loop """ - for th in range(self.num_episode): + for th in range(num_episode): """ (3.1) Sequence of decisions """ sequence_of_states, rewards = self.sequence_of_actions(root_rl_state) - if self.verbose >= 10: - logger.info('#' * 10, end='') - logger.info(f'{th}\t.th Sequence of Actions', end='') - logger.info('#' * 10) - for step, (current_state, next_state) in enumerate(sequence_of_states): - logger.info(f'{step}. Transition \n{current_state}\n----->\n{next_state}') - logger.info(f'Reward:{rewards[step]}') - - if th % log_every_n_episodes == 0: - if self.verbose >= 1: - logger.info('{0}.th iter. SumOfRewards: {1:.2f}\t' - 'Epsilon:{2:.2f}\t' - '|ReplayMem.|:{3}'.format(th, sum(rewards), - self.epsilon, - len(self.experiences))) + """ + + print('#' * 10, end='') + print(f'\t{th}.th Sequence of Actions\t', end='') + print('#' * 10) + for step, (current_state, next_state) in enumerate(sequence_of_states): + print(f'{step}. Transition \n{current_state}\n----->\n{next_state}') + print(f'Reward:{rewards[step]}') + + print('{0}.th iter. SumOfRewards: {1:.2f}\t' + 'Epsilon:{2:.2f}\t' + '|ReplayMem.|:{3}'.format(th, sum(rewards), + self.epsilon, + len(self.experiences))) + """ """(3.2) Form experiences""" self.form_experiences(sequence_of_states, rewards) sum_of_rewards_per_actions.append(sum(rewards)) """(3.2) Learn from experiences""" - if th % self.num_episodes_per_replay == 0: - self.learn_from_replay_memory() + # if th % self.num_episodes_per_replay == 0: + self.learn_from_replay_memory() """(3.4) Exploration Exploitation""" if self.epsilon < 0: break @@ -464,8 +482,7 @@ def form_experiences(self, state_pairs: List, rewards: List) -> None: y - Argmax Q value. """ - if self.verbose > 1: - print('Form Experiences for the training') + print('Form Experiences for the training') for th, consecutive_states in enumerate(state_pairs): e, e_next = consecutive_states @@ -476,13 +493,15 @@ def learn_from_replay_memory(self) -> None: """ Learning by replaying memory. 
""" - if self.verbose > 1: - print('Learn from Experience') - - current_state_batch, next_state_batch, q_values = self.experiences.retrieve() + print('learn_from_replay_memory', end="\t|\t") + current_state_batch: List[torch.FloatTensor] + next_state_batch: List[torch.FloatTensor] + current_state_batch, next_state_batch, y = self.experiences.retrieve() + # N, 1, dim current_state_batch = torch.cat(current_state_batch, dim=0) + # N, 1, dim next_state_batch = torch.cat(next_state_batch, dim=0) - q_values = torch.Tensor(q_values) + y = torch.Tensor(y) try: assert current_state_batch.shape[1] == next_state_batch.shape[1] == self.emb_pos.shape[1] == \ @@ -499,6 +518,14 @@ def learn_from_replay_memory(self) -> None: assert current_state_batch.shape[2] == next_state_batch.shape[2] == self.emb_pos.shape[2] == self.emb_neg.shape[ 2] + + num_next_states = len(current_state_batch) + + # batch, 4, dim + X = torch.cat([current_state_batch, next_state_batch, self.emb_pos.repeat((num_next_states, 1, 1)), + self.emb_neg.repeat((num_next_states, 1, 1))], 1) + """ + # We can skip this part perhaps dataset = PrepareBatchOfTraining(current_state_batch=current_state_batch, next_state_batch=next_state_batch, p=self.emb_pos, n=self.emb_neg, q=q_values) @@ -506,27 +533,27 @@ def learn_from_replay_memory(self) -> None: data_loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers) - if self.verbose > 1: - print(f'Number of experiences:{num_experience}') - print('DQL agent is learning via experience replay') + """ + + print(f'Experiences:{X.shape}', end="\t|\t") self.heuristic_func.net.train() + total_loss = 0 for m in range(self.num_epochs_per_replay): - total_loss = 0 - for X, y in data_loader: - self.optimizer.zero_grad() # zero the gradient buffers - # forward - predicted_q = self.heuristic_func.net.forward(X) - # loss - loss = self.heuristic_func.net.loss(predicted_q, y) - total_loss += loss.item() - # compute the derivative of the loss w.r.t. the parameters using backpropagation - loss.backward() - # clip gradients if gradients are killed. =>torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5) - self.optimizer.step() - if self.verbose > 1: - print(f'{m}.th Epoch average loss during training:{total_loss / num_experience}') - - self.heuristic_func.net.train().eval() + self.optimizer.zero_grad() # zero the gradient buffers + # forward + # n by 4, dim + predicted_q = self.heuristic_func.net.forward(X) + # loss + loss = self.heuristic_func.net.loss(predicted_q, y) + total_loss += loss.item() + # compute the derivative of the loss w.r.t. the parameters using backpropagation + loss.backward() + # clip gradients if gradients are killed. =>torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5) + self.optimizer.step() + + print(f'Average loss during training: {total_loss / self.num_epochs_per_replay:0.5f}') + + self.heuristic_func.net.eval() def update_search(self, concepts, predicted_Q_values=None): """ @@ -642,7 +669,7 @@ def exploration_exploitation_tradeoff(self, current_state: AbstractNode, (1) Exploration. (2) Exploitation. 
""" - if np.random.random() < self.epsilon: + if random.random() < self.epsilon: next_state = random.choice(next_states) self.assign_embeddings(next_state) else: @@ -710,7 +737,52 @@ def retrieve_concept_chain(rl_state: RL_State) -> List[RL_State]: hierarchy.appendleft(rl_state) return list(hierarchy) - def train(self, dataset: Iterable[Tuple[str, Set, Set]], relearn_ratio: int = 2): + def generate_learning_problems(self, dataset: Optional[Iterable[Tuple[str, Set, Set]]] = None, + num_learning_problems: int = 5) -> Iterable[ + Tuple[str, Set, Set]]: + """ Generate learning problems if none is provided. + + Time complexity: O(n^2) n = named concepts + """ + + if dataset is None: + learning_problems = [] + counter = 0 + size_of_examples = 3 + print("Generating learning problems...") + for i in self.kb.get_concepts(): + individuals_i = set(self.kb.individuals(i)) + + if len(individuals_i) > size_of_examples: + str_dl_concept_i = self.renderer.render(i) + for j in self.kb.get_concepts(): + if i == j: + continue + individuals_j = set(self.kb.individuals(j)) + if len(individuals_j) < size_of_examples: + continue + + lp = (str_dl_concept_i, + set(random.sample(individuals_i, size_of_examples)), + set(random.sample(individuals_j, size_of_examples))) + yield lp + counter += 1 + + if counter == num_learning_problems: + break + + if counter == num_learning_problems: + break + else: + """Empy concept""" + + # assert isinstance(learning_problems, Iterable) + # return learning_problems + else: + return dataset + + def train(self, dataset: Optional[Iterable[Tuple[str, Set, Set]]] = None, num_episode: int = 10, + relearn_ratio: int = 2, num_learning_problems=3): """ Train RL agent on learning problems with relearn_ratio. @@ -730,8 +802,10 @@ def train(self, dataset: Iterable[Tuple[str, Set, Set]], relearn_ratio: int = 2) Returns: self. 
""" - if self.verbose > 0: - logger.info(f'Training starts.\nNumber of learning problem:{len(dataset)},\t Relearn ratio:{relearn_ratio}') + if self.pre_trained_kge is None: + return self.terminate_training() + + dataset = self.generate_learning_problems(dataset, num_learning_problems) counter = 1 renderer = DLSyntaxObjectRenderer() @@ -739,23 +813,18 @@ def train(self, dataset: Iterable[Tuple[str, Set, Set]], relearn_ratio: int = 2) for _ in range(relearn_ratio): for (target_owl_ce, positives, negatives) in dataset: - if self.verbose > 0: - logger.info( - 'Goal Concept:{0}\tE^+:[{1}] \t E^-:[{2}]'.format(target_owl_ce, - len(positives), len(negatives))) - logger.info(f'RL training on {counter}.th learning problem starts') + print('Goal Concept:{0}\tE^+:[{1}] \t E^-:[{2}]'.format(target_owl_ce, + len(positives), len(negatives))) + print(f'RL training on {counter}.th learning problem {target_owl_ce} starts') - goal_path = list(reversed(self.retrieve_concept_chain(target_owl_ce))) - # goal_path: [⊤, Daughter, Daughter ⊓ Mother] - sum_of_rewards_per_actions = self.rl_learning_loop(pos_uri=positives, neg_uri=negatives, - goal_path=goal_path) + sum_of_rewards_per_actions = self.rl_learning_loop(num_episode=num_episode, pos_uri=positives, + neg_uri=negatives) - if self.verbose > 2: - logger.info(f'Sum of Rewards in first 3 trajectory:{sum_of_rewards_per_actions[:3]}') - logger.info(f'Sum of Rewards in last 3 trajectory:{sum_of_rewards_per_actions[:3]}') + print(f'Sum of Rewards in first 3 trajectory:{sum_of_rewards_per_actions[:3]}') + print(f'Sum of Rewards in last 3 trajectory:{sum_of_rewards_per_actions[:3]}') self.seen_examples.setdefault(counter, dict()).update( - {'Concept': renderer.render(target_owl_ce.concept), + {'Concept': target_owl_ce, 'Positives': [i.get_iri().as_str() for i in positives], 'Negatives': [i.get_iri().as_str() for i in negatives]}) @@ -840,10 +909,10 @@ def forward(self, X: torch.FloatTensor): X n by 4 by d float tensor """ # N x 32 x D - X = F.relu(self.conv1(X)) + X = torch.nn.functional.relu(self.conv1(X)) X = X.flatten(start_dim=1) # N x (32D/2) - X = F.relu(self.fc1(X)) + X = torch.nn.functional.relu(self.fc1(X)) # N x 1 scores = self.fc2(X).flatten() return scores diff --git a/ontolearn/refinement_operators.py b/ontolearn/refinement_operators.py index dd23e33a..0417738b 100644 --- a/ontolearn/refinement_operators.py +++ b/ontolearn/refinement_operators.py @@ -44,8 +44,7 @@ def __init__(self, knowledge_base: KnowledgeBase, use_inverse=False, assert num_of_named_classes == len(list(i for i in self.kb.ontology().classes_in_signature())) self.max_len_refinement_top = 5 - self.top_refinements = {ref for ref in self.refine_top()} - print("Top refinements:", len(self.top_refinements)) + self.top_refinements = None # {ref for ref in self.refine_top()} def from_iterables(self, cls, a_operands, b_operands): assert (isinstance(a_operands, Generator) is False) and (isinstance(b_operands, Generator) is False) @@ -237,6 +236,9 @@ def refine_object_intersection_of(self, class_expression: OWLClassExpression) -> def refine(self, class_expression) -> Iterable[OWLClassExpression]: assert isinstance(class_expression, OWLClassExpression) + if self.top_refinements is None: + self.top_refinements = {ref for ref in self.refine_top()} + if class_expression.is_owl_thing(): yield from self.top_refinements elif class_expression.is_owl_nothing(): @@ -263,11 +265,6 @@ def refine(self, class_expression) -> Iterable[OWLClassExpression]: else: raise ValueError(f"{type(class_expression)} objects 
are not yet supported") - """ - - - """ - class ModifiedCELOERefinement(BaseRefinement[OENode]): """ From d66fd2ef5a9ec68f5261b295d97ef52db5b0bb32 Mon Sep 17 00:00:00 2001 From: Alkid Date: Wed, 6 Dec 2023 23:34:18 +0100 Subject: [PATCH 20/31] Added option to include implicit individuals --- ontolearn/knowledge_base.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/ontolearn/knowledge_base.py b/ontolearn/knowledge_base.py index 75e2ee5a..a552daaa 100644 --- a/ontolearn/knowledge_base.py +++ b/ontolearn/knowledge_base.py @@ -2,6 +2,7 @@ import logging import random +from itertools import chain from typing import Iterable, Optional, Callable, overload, Union, FrozenSet, Set, Dict from ontolearn.base import OWLOntology_Owlready2, OWLOntologyManager_Owlready2, OWLReasoner_Owlready2 from ontolearn.base.fast_instance_checker import OWLReasoner_FastInstanceChecker @@ -63,6 +64,8 @@ class KnowledgeBase(AbstractKnowledgeBase): reasoner of this object, if you enter a reasoner using :arg:`reasoner_factory` or :arg:`reasoner` argument it will override this setting. triplestore_address: The address where the triplestore is hosted. + include_implicit_individuals: Whether to identify and consider instances which are not set as OWL Named + Individuals (does not contain this type) as individuals. Attributes: generator (ConceptGenerator): Instance of concept generator. @@ -104,7 +107,8 @@ def __init__(self, *, length_metric_factory: Optional[Factory[[], OWLClassExpressionLengthMetric]] = None, individuals_cache_size=128, triplestore_address: str = None, - backend_store: bool = False): + backend_store: bool = False, + include_implicit_individuals=False): ... @overload @@ -136,7 +140,8 @@ def __init__(self, *, backend_store: bool = False, class_hierarchy: Optional[ClassHierarchy] = None, object_property_hierarchy: Optional[ObjectPropertyHierarchy] = None, - data_property_hierarchy: Optional[DatatypePropertyHierarchy] = None + data_property_hierarchy: Optional[DatatypePropertyHierarchy] = None, + include_implicit_individuals=False ): AbstractKnowledgeBase.__init__(self) self.path = path @@ -207,16 +212,18 @@ def __init__(self, *, self._dp_ranges = dict() self.generator = ConceptGenerator() - if isinstance(self._reasoner, OWLReasoner_FastInstanceChecker) and triplestore_address is None: - self._ind_set = self._reasoner._ind_set # performance hack - else: - individuals = self._ontology.individuals_in_signature() - self._ind_set = frozenset(individuals) - self.use_individuals_cache = individuals_cache_size > 0 if self.use_individuals_cache: self._ind_cache = LRUCache(maxsize=individuals_cache_size) + if include_implicit_individuals: + self._ind_set = frozenset(chain.from_iterable(self.individuals(i) for i in self.get_concepts())) + elif isinstance(self._reasoner, OWLReasoner_FastInstanceChecker) and triplestore_address is None: + self._ind_set = self._reasoner._ind_set # performance hack: + else: + individuals = self._ontology.individuals_in_signature() + self._ind_set = frozenset(individuals) + self.describe() def ontology(self) -> OWLOntology: @@ -455,7 +462,7 @@ def _data_properties_for_domain(self, domain: OWLClassExpression, data_propertie yield prop def __repr__(self): - properties_count = iter_count(self.ontology().object_properties_in_signature()) + iter_count( + properties_count = iter_count(self._ind_set) + iter_count( self.ontology().data_properties_in_signature()) class_count = iter_count(self.ontology().classes_in_signature()) individuals_count = 
self.individuals_count() From e7d65cdbcc59f9e1a305ef441b5b5b9cdd757663 Mon Sep 17 00:00:00 2001 From: Alkid Date: Thu, 7 Dec 2023 00:14:55 +0100 Subject: [PATCH 21/31] Fixes --- ontolearn/knowledge_base.py | 2 +- ontolearn/model_adapter.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/ontolearn/knowledge_base.py b/ontolearn/knowledge_base.py index a552daaa..61e2f51d 100644 --- a/ontolearn/knowledge_base.py +++ b/ontolearn/knowledge_base.py @@ -462,7 +462,7 @@ def _data_properties_for_domain(self, domain: OWLClassExpression, data_propertie yield prop def __repr__(self): - properties_count = iter_count(self._ind_set) + iter_count( + properties_count = iter_count(self.ontology().object_properties_in_signature()) + iter_count( self.ontology().data_properties_in_signature()) class_count = iter_count(self.ontology().classes_in_signature()) individuals_count = self.individuals_count() diff --git a/ontolearn/model_adapter.py b/ontolearn/model_adapter.py index bd9326a9..96fe6b1a 100644 --- a/ontolearn/model_adapter.py +++ b/ontolearn/model_adapter.py @@ -4,13 +4,12 @@ import logging import re from typing import TypeVar, List, Optional, Union -from owlapy.render import DLSyntaxObjectRenderer from ontolearn.abstracts import AbstractHeuristic, AbstractScorer, BaseRefinement, AbstractKnowledgeBase, \ AbstractNode from ontolearn.base_concept_learner import BaseConceptLearner from owlapy.model import OWLReasoner, OWLNamedIndividual, OWLClassExpression, OWLAxiom, IRI from ontolearn.base import OWLReasoner_Owlready2_ComplexCEInstances -from ontolearn.concept_learner import CELOE, OCEL, EvoLearner, NCES, Drill +from ontolearn.concept_learner import CELOE, OCEL, EvoLearner, NCES from ontolearn.ea_algorithms import EASimple from ontolearn.ea_initialization import EARandomWalkInitialization, EARandomInitialization, RandomInitMethod from ontolearn.fitness_functions import LinearPressureFitness @@ -33,8 +32,7 @@ models = {'celoe': CELOE, 'ocel': OCEL, 'evolearner': EvoLearner, - 'nces': NCES, - 'drill': Drill} + 'nces': NCES} heuristics = {'celoe': CELOEHeuristic, 'ocel': OCELHeuristic} From cd6249b7aae32430a99884a10c6680c31d5ee952 Mon Sep 17 00:00:00 2001 From: Alkid Date: Thu, 7 Dec 2023 00:58:53 +0100 Subject: [PATCH 22/31] Updated python version to 3.9.18 --- environment.yml | 5 +++-- setup.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/environment.yml b/environment.yml index 8bf41366..b651d2d8 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: - conda-forge - pytorch dependencies: - - python=3.8 + - python=3.9.18 - matplotlib=3.3.4 - pandas=1.2.3 - pytorch=1.8.0 @@ -26,12 +26,12 @@ dependencies: # docs - sphinx=4.0.2 - sphinx_rtd_theme=0.5.1 - - myst-parser=0.15.2 # # building - build=0.6.0 # PIP - pip=21.0.1 - pip: + - dicee==0.1.2 - owlready2==0.40 - owlapy==0.1.0 - tokenizers==0.12.1 @@ -43,3 +43,4 @@ dependencies: - sphinx-theme==1.0 - sphinxcontrib-plantuml==0.21 - plantuml-local-client==1.2021.10 + - jinja2==3.1.2 diff --git a/setup.py b/setup.py index dddc6938..f74d8b13 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ "Programming Language :: Python :: 3.8", "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", "Topic :: Scientific/Engineering :: Artificial Intelligence"], - python_requires='>=3.8', + python_requires='>=3.9.18', entry_points={"console_scripts": ["ontolearn = ontolearn.run:main"]}, long_description=long_description, long_description_content_type="text/markdown", From 
3c30937618e076c111af1ef1e6e29da01cf634b1 Mon Sep 17 00:00:00 2001 From: Alkid Date: Thu, 7 Dec 2023 10:53:46 +0100 Subject: [PATCH 23/31] Potential fix for conda env --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index b651d2d8..b716cb86 100644 --- a/environment.yml +++ b/environment.yml @@ -26,6 +26,7 @@ dependencies: # docs - sphinx=4.0.2 - sphinx_rtd_theme=0.5.1 + - myst-parser==2.0.0 # # building - build=0.6.0 # PIP From f0de612792da561d2eba865df816a83a387a6929 Mon Sep 17 00:00:00 2001 From: Alkid Date: Thu, 7 Dec 2023 11:00:39 +0100 Subject: [PATCH 24/31] Potential fix 2 for conda env --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index b716cb86..115d0ee9 100644 --- a/environment.yml +++ b/environment.yml @@ -26,7 +26,7 @@ dependencies: # docs - sphinx=4.0.2 - sphinx_rtd_theme=0.5.1 - - myst-parser==2.0.0 + - myst-parser==0.19.0 # # building - build=0.6.0 # PIP From ad92a9fa7baf926f7c8ef6f5ad31409edb0bd559 Mon Sep 17 00:00:00 2001 From: Alkid Date: Thu, 7 Dec 2023 11:05:25 +0100 Subject: [PATCH 25/31] Potential fix 3 for conda env --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 115d0ee9..d06b459e 100644 --- a/environment.yml +++ b/environment.yml @@ -26,7 +26,7 @@ dependencies: # docs - sphinx=4.0.2 - sphinx_rtd_theme=0.5.1 - - myst-parser==0.19.0 + - myst-parser==0.15.2 # # building - build=0.6.0 # PIP From a125c7b4236f5fb217482b794c6c79e67432085c Mon Sep 17 00:00:00 2001 From: Alkid Date: Thu, 7 Dec 2023 14:39:50 +0100 Subject: [PATCH 26/31] commented docs workflow temporarily --- .github/workflows/docs.yml | 112 ++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 56 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 95f384ef..07b5b5dc 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -1,56 +1,56 @@ -name: Build docs - -on: - push: - branches: - - master - - develop - pull_request: - -jobs: - docs: - runs-on: ubuntu-latest - strategy: - max-parallel: 5 - - steps: - - uses: actions/checkout@v3 - - name: Install tox - id: install-tox - run: | - pip install tox - echo "tox_version=$(pip list | grep tox | tr -d ' ')" >> $GITHUB_OUTPUT - - name: prepare required software - run: | - # epstopdf & dot & noto-fonts - sudo apt update && sudo apt install texlive-font-utils graphviz fonts-noto - - uses: actions/cache@v3 - with: - key: ${{ runner.os }}-${{ steps.install-tox.outputs.tox_version }}-docs-${{ hashFiles('environment.yml', 'tox.ini') }} - path: .tox - - name: Build docs with tox - run: | - tox -o -e docs - - name: Build latex docs with tox - run: | - tox -o -e docs -- latex - - name: Compile LaTeX document - uses: docker://texlive/texlive:latest - with: - args: make -C docs/_build/latex - - run: | - cp docs/_build/latex/ontolearn.pdf docs/_build/html/ - - name: Deploy to netlify - uses: nwtgck/actions-netlify@v1.2 - with: - publish-dir: 'docs/_build/html' - production-branch: develop - github-token: ${{ secrets.GITHUB_TOKEN }} - deploy-message: "Deploy from GitHub Actions ${{ github.sha }}" - alias: ${{ github.head_ref }} - enable-pull-request-comment: false - enable-commit-comment: false - env: - NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} - NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }} - timeout-minutes: 5 +#name: Build docs +# +#on: +# push: +# branches: +# - master +# - develop +# 
pull_request: +# +#jobs: +# docs: +# runs-on: ubuntu-latest +# strategy: +# max-parallel: 5 +# +# steps: +# - uses: actions/checkout@v3 +# - name: Install tox +# id: install-tox +# run: | +# pip install tox +# echo "tox_version=$(pip list | grep tox | tr -d ' ')" >> $GITHUB_OUTPUT +# - name: prepare required software +# run: | +# # epstopdf & dot & noto-fonts +# sudo apt update && sudo apt install texlive-font-utils graphviz fonts-noto +# - uses: actions/cache@v3 +# with: +# key: ${{ runner.os }}-${{ steps.install-tox.outputs.tox_version }}-docs-${{ hashFiles('environment.yml', 'tox.ini') }} +# path: .tox +# - name: Build docs with tox +# run: | +# tox -o -e docs +# - name: Build latex docs with tox +# run: | +# tox -o -e docs -- latex +# - name: Compile LaTeX document +# uses: docker://texlive/texlive:latest +# with: +# args: make -C docs/_build/latex +# - run: | +# cp docs/_build/latex/ontolearn.pdf docs/_build/html/ +# - name: Deploy to netlify +# uses: nwtgck/actions-netlify@v1.2 +# with: +# publish-dir: 'docs/_build/html' +# production-branch: develop +# github-token: ${{ secrets.GITHUB_TOKEN }} +# deploy-message: "Deploy from GitHub Actions ${{ github.sha }}" +# alias: ${{ github.head_ref }} +# enable-pull-request-comment: false +# enable-commit-comment: false +# env: +# NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} +# NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }} +# timeout-minutes: 5 From 8dedb347125347453a368843a97e59386909f8d0 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Thu, 7 Dec 2023 17:19:59 +0100 Subject: [PATCH 27/31] WIP: DRILL training --- examples/concept_learning_evaluation.py | 3 +- ontolearn/heuristics.py | 2 +- ontolearn/learners/drill.py | 253 +++++++++++------------- 3 files changed, 117 insertions(+), 141 deletions(-) diff --git a/examples/concept_learning_evaluation.py b/examples/concept_learning_evaluation.py index db368924..2a98f44f 100644 --- a/examples/concept_learning_evaluation.py +++ b/examples/concept_learning_evaluation.py @@ -19,7 +19,7 @@ def dl_concept_learning(args): kb = KnowledgeBase(path=args.kb) drill = Drill(knowledge_base=kb, path_pretrained_kge=args.path_pretrained_kge, quality_func=F1(), - max_runtime=args.max_runtime).train(num_episode=1, num_learning_problems=1) + max_runtime=args.max_runtime).train(num_of_target_concepts=2, num_learning_problems=2) ocel = OCEL(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) celoe = CELOE(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) evo = EvoLearner(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) @@ -42,7 +42,6 @@ def dl_concept_learning(args): start_time = time.time() pred_drill = drill.fit(lp).best_hypotheses(n=1) rt_drill = time.time() - start_time - start_time = time.time() pred_ocel = ocel.fit(lp).best_hypotheses(n=1) rt_ocel = time.time() - start_time diff --git a/ontolearn/heuristics.py b/ontolearn/heuristics.py index 797e9aaf..f50bd256 100644 --- a/ontolearn/heuristics.py +++ b/ontolearn/heuristics.py @@ -122,7 +122,7 @@ def apply(self, node: LBLNode, instances, learning_problem: EncodedPosNegLPStand node.heuristic = round(heuristic_val, 5) -class Reward: +class CeloeBasedReward: """Reward function for DRILL.""" def __init__(self, reward_of_goal=5.0, beta=.04, alpha=.5): self.name = 'DRILL_Reward' diff --git a/ontolearn/learners/drill.py b/ontolearn/learners/drill.py index 21f10ff3..efbd6a6d 100644 --- a/ontolearn/learners/drill.py +++ b/ontolearn/learners/drill.py @@ -17,7 +17,7 @@ from owlapy.render import 
DLSyntaxObjectRenderer from ontolearn.metrics import F1 import random -from ontolearn.heuristics import Reward +from ontolearn.heuristics import CeloeBasedReward import torch from ontolearn.data_struct import PrepareBatchOfTraining, PrepareBatchOfPrediction @@ -73,7 +73,7 @@ def __init__(self, knowledge_base, else: refinement_operator = refinement_operator if reward_func is None: - self.reward_func = Reward() + self.reward_func = CeloeBasedReward() else: self.reward_func = reward_func @@ -128,36 +128,6 @@ def __init__(self, knowledge_base, self.operator: RefinementBasedConceptLearner - def best_hypotheses(self, n=1): - assert self.search_tree is not None - assert len(self.search_tree) > 1 - if n == 1: - return [i for i in self.search_tree.get_top_n_nodes(n)][0] - else: - return [i for i in self.search_tree.get_top_n_nodes(n)] - - def clean(self): - self.emb_pos, self.emb_neg = None, None - self.goal_found = False - self.start_time = None - if len(self.search_tree) != 0: - self.search_tree.clean() - - try: - assert len(self.search_tree) == 0 - except AssertionError: - print(len(self.search_tree)) - raise AssertionError('EMPTY search tree') - - self._number_of_tested_concepts = 0 - - def downward_refinement(self, *args, **kwargs): - ValueError('downward_refinement') - - def next_node_to_expand(self) -> RL_State: - """ Return a node that maximizes the heuristic function at time t. """ - return self.search_tree.get_most_promising() - def initialize_class_expression_learning_problem(self, pos: Set[OWLNamedIndividual], neg: Set[OWLNamedIndividual]): """ Determine the learning problem and initialize the search. @@ -358,58 +328,38 @@ def apply_refinement(self, rl_state: RL_State) -> Generator: for i in self.operator.refine(rl_state.concept): # O(N) yield self.create_rl_state(i, parent_node=rl_state) - def learn_from_illustration(self, sequence_of_goal_path: List[RL_State]): - """ - Args: - sequence_of_goal_path: ⊤,Parent,Parent ⊓ Daughter. - """ - current_state = sequence_of_goal_path.pop(0) - rewards = [] - sequence_of_states = [] - while len(sequence_of_goal_path) > 0: - self.assign_embeddings(current_state) - current_state.length = self.kb.concept_len(current_state.concept) - if current_state.quality is None: - self.compute_quality_of_class_expression(current_state) - - next_state = sequence_of_goal_path.pop(0) - self.assign_embeddings(next_state) - next_state.length = self.kb.concept_len(next_state.concept) - if next_state.quality is None: - self.compute_quality_of_class_expression(next_state) - sequence_of_states.append((current_state, next_state)) - rewards.append(self.reward_func.apply(current_state, next_state)) - for x in range(2): - self.form_experiences(sequence_of_states, rewards) - self.learn_from_replay_memory() - - def rl_learning_loop(self, num_episode: int, pos_uri: Set[OWLNamedIndividual], neg_uri: Set[OWLNamedIndividual], + def rl_learning_loop(self, num_episode: int, + pos_uri: Set[OWLNamedIndividual], + neg_uri: Set[OWLNamedIndividual], goal_path: List[RL_State] = None) -> List[float]: - """ - Standard RL training loop. + """ Reinforcement Learning Training Loop - 1. Initialize RL environment for training. + Initialize RL environment for a given learning problem (E^+ pos_iri and E^- neg_iri ) - 2. Learn from an illustration if possible. - 2. Training Loop. + Training: + 2.1 Obtain a trajectory: A sequence of RL states/DL concepts + T, Person, (Female and \forall hasSibling Female). 
+ Rewards at each transition are also computed """ - """ (1) Initialize RL environment for training """ + + # (1) Initialize RL environment for training + print("Reinforcement Learning loop started...") assert isinstance(pos_uri, Set) and isinstance(neg_uri, Set) self.init_training(pos_uri=pos_uri, neg_uri=neg_uri) root_rl_state = self.create_rl_state(self.start_class, is_root=True) self.compute_quality_of_class_expression(root_rl_state) sum_of_rewards_per_actions = [] - """ (2) Learn from an illustration if possible """ - if goal_path: - self.learn_from_illustration(goal_path) - """ (3) Reinforcement Learning offline training loop """ + # () Reinforcement Learning offline training loop for th in range(num_episode): - """ (3.1) Sequence of decisions """ + print(f"Episode {th + 1}: ", end=" ") + # Sequence of decisions + start_time = time.time() sequence_of_states, rewards = self.sequence_of_actions(root_rl_state) - + print(f"Runtime {time.time() - start_time:.3f} secs", end=" | ") + print(f"Max reward: {max(rewards)}", end=" | ") + print(f"Epsilon : {self.epsilon}") """ - print('#' * 10, end='') print(f'\t{th}.th Sequence of Actions\t', end='') print('#' * 10) @@ -423,7 +373,7 @@ def rl_learning_loop(self, num_episode: int, pos_uri: Set[OWLNamedIndividual], n self.epsilon, len(self.experiences))) """ - """(3.2) Form experiences""" + # Form experiences self.form_experiences(sequence_of_states, rewards) sum_of_rewards_per_actions.append(sum(rewards)) """(3.2) Learn from experiences""" @@ -436,6 +386,15 @@ def rl_learning_loop(self, num_episode: int, pos_uri: Set[OWLNamedIndividual], n return sum_of_rewards_per_actions + def select_next_state(self, current_state, next_rl_states) -> Tuple[RL_State, float]: + if True: + next_selected_rl_state = self.exploration_exploitation_tradeoff(current_state, next_rl_states) + return next_selected_rl_state, self.reward_func.apply(current_state, next_selected_rl_state) + else: + for i in next_rl_states: + print(i) + exit(1) + def sequence_of_actions(self, root_rl_state: RL_State) -> Tuple[List[Tuple[AbstractNode, AbstractNode]], List[SupportsFloat]]: assert isinstance(root_rl_state, RL_State) @@ -458,12 +417,12 @@ def sequence_of_actions(self, root_rl_state: RL_State) -> Tuple[List[Tuple[Abstr # assert (current_state.length + 3) <= self.max_child_length print('No next state') break - # (1.3) - next_selected_rl_state = self.exploration_exploitation_tradeoff(current_state, next_rl_states) + next_selected_rl_state, reward = self.select_next_state(current_state, next_rl_states) # (1.4) Remember the concept path path_of_concepts.append((current_state, next_selected_rl_state)) # (1.5) - rewards.append(self.reward_func.apply(current_state, next_selected_rl_state)) + rewards.append(reward) + # (1.6) current_state = next_selected_rl_state return path_of_concepts, rewards @@ -481,9 +440,6 @@ def form_experiences(self, state_pairs: List, rewards: List) -> None: X - A list of embeddings of current concept, next concept, positive examples, negative examples. y - Argmax Q value. """ - - print('Form Experiences for the training') - for th, consecutive_states in enumerate(state_pairs): e, e_next = consecutive_states self.experiences.append( @@ -493,7 +449,7 @@ def learn_from_replay_memory(self) -> None: """ Learning by replaying memory. 
""" - print('learn_from_replay_memory', end="\t|\t") + # print('learn_from_replay_memory', end="\t|\t") current_state_batch: List[torch.FloatTensor] next_state_batch: List[torch.FloatTensor] current_state_batch, next_state_batch, y = self.experiences.retrieve() @@ -534,14 +490,12 @@ def learn_from_replay_memory(self) -> None: batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers) """ - - print(f'Experiences:{X.shape}', end="\t|\t") + # print(f'Experiences:{X.shape}', end="\t|\t") self.heuristic_func.net.train() total_loss = 0 for m in range(self.num_epochs_per_replay): self.optimizer.zero_grad() # zero the gradient buffers - # forward - # n by 4, dim + # forward: n by 4, dim predicted_q = self.heuristic_func.net.forward(X) # loss loss = self.heuristic_func.net.loss(predicted_q, y) @@ -551,8 +505,7 @@ def learn_from_replay_memory(self) -> None: # clip gradients if gradients are killed. =>torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5) self.optimizer.step() - print(f'Average loss during training: {total_loss / self.num_epochs_per_replay:0.5f}') - + # print(f'Average loss during training: {total_loss / self.num_epochs_per_replay:0.5f}') self.heuristic_func.net.eval() def update_search(self, concepts, predicted_Q_values=None): @@ -738,6 +691,7 @@ def retrieve_concept_chain(rl_state: RL_State) -> List[RL_State]: return list(hierarchy) def generate_learning_problems(self, dataset: Optional[Iterable[Tuple[str, Set, Set]]] = None, + num_of_target_concepts: int = 3, num_learning_problems: int = 5) -> Iterable[ Tuple[str, Set, Set]]: """ Generate learning problems if none is provided. @@ -746,10 +700,9 @@ def generate_learning_problems(self, dataset: Optional[Iterable[Tuple[str, Set, """ if dataset is None: - learning_problems = [] counter = 0 size_of_examples = 3 - print("Generating learning problems...") + print("Generating learning problems on the fly...") for i in self.kb.get_concepts(): individuals_i = set(self.kb.individuals(i)) @@ -761,78 +714,102 @@ def generate_learning_problems(self, dataset: Optional[Iterable[Tuple[str, Set, individuals_j = set(self.kb.individuals(j)) if len(individuals_j) < size_of_examples: continue + for _ in range(num_learning_problems): + lp = (str_dl_concept_i, + set(random.sample(individuals_i, size_of_examples)), + set(random.sample(individuals_j, size_of_examples))) + yield lp - lp = (str_dl_concept_i, - set(random.sample(individuals_i, size_of_examples)), - set(random.sample(individuals_j, size_of_examples))) - yield lp counter += 1 - if counter == num_learning_problems: + if counter == num_of_target_concepts: break - - if counter == num_learning_problems: + if counter == num_of_target_concepts: break else: """Empy concept""" - - # assert isinstance(learning_problems, Iterable) - # return learning_problems else: return dataset - def train(self, dataset: Optional[Iterable[Tuple[str, Set, Set]]] = None, num_episode: int = 10, - relearn_ratio: int = 2, num_learning_problems=3): - """ - Train RL agent on learning problems with relearn_ratio. - - Args: - dataset: An iterable containing training data. Each item corresponds to a tuple of string representation - of target concept, a set of positive examples in the form of URIs amd a set of negative examples in - the form of URIs, respectively. - relearn_ratio: An integer indicating the number of times dataset is iterated. - - Computation: - 1. Dataset and relearn_ratio loops: Learn each problem relearn_ratio times. - - 2. Learning loop. 
+ def train(self, dataset: Optional[Iterable[Tuple[str, Set, Set]]] = None, num_of_target_concepts: int = 3, + num_episode: int = 3, num_learning_problems: int = 3): + """ Train an RL agent on description logic concept learning problems """ - 3. Take post process action that implemented by subclass. - - Returns: - self. - """ if self.pre_trained_kge is None: return self.terminate_training() - dataset = self.generate_learning_problems(dataset, num_learning_problems) counter = 1 - renderer = DLSyntaxObjectRenderer() + for (target_owl_ce, positives, negatives) in self.generate_learning_problems(dataset, + num_of_target_concepts, + num_learning_problems): + print(f"Goal Concept:\t {target_owl_ce}\tE^+:[{len(positives)}]\t E^-:[{len(negatives)}]") + sum_of_rewards_per_actions = self.rl_learning_loop(num_episode=num_episode, pos_uri=positives, + neg_uri=negatives) + # print(f'Sum of Rewards in last 3 trajectories:{sum_of_rewards_per_actions[:3]}') + + self.seen_examples.setdefault(counter, dict()).update( + {'Concept': target_owl_ce, + 'Positives': [i.get_iri().as_str() for i in positives], + 'Negatives': [i.get_iri().as_str() for i in negatives]}) + counter += 1 + if counter % 100 == 0: + self.save_weights() + return self.terminate_training() - # 1. - for _ in range(relearn_ratio): - for (target_owl_ce, positives, negatives) in dataset: + def learn_from_illustration(self, sequence_of_goal_path: List[RL_State]): + """ + Args: + sequence_of_goal_path: ⊤,Parent,Parent ⊓ Daughter. + """ + current_state = sequence_of_goal_path.pop(0) + rewards = [] + sequence_of_states = [] + while len(sequence_of_goal_path) > 0: + self.assign_embeddings(current_state) + current_state.length = self.kb.concept_len(current_state.concept) + if current_state.quality is None: + self.compute_quality_of_class_expression(current_state) - print('Goal Concept:{0}\tE^+:[{1}] \t E^-:[{2}]'.format(target_owl_ce, - len(positives), len(negatives))) - print(f'RL training on {counter}.th learning problem {target_owl_ce} starts') + next_state = sequence_of_goal_path.pop(0) + self.assign_embeddings(next_state) + next_state.length = self.kb.concept_len(next_state.concept) + if next_state.quality is None: + self.compute_quality_of_class_expression(next_state) + sequence_of_states.append((current_state, next_state)) + rewards.append(self.reward_func.apply(current_state, next_state)) + for x in range(2): + self.form_experiences(sequence_of_states, rewards) + self.learn_from_replay_memory() + + def best_hypotheses(self, n=1): + assert self.search_tree is not None + assert len(self.search_tree) > 1 + if n == 1: + return [i for i in self.search_tree.get_top_n_nodes(n)][0] + else: + return [i for i in self.search_tree.get_top_n_nodes(n)] + + def clean(self): + self.emb_pos, self.emb_neg = None, None + self.goal_found = False + self.start_time = None + if len(self.search_tree) != 0: + self.search_tree.clean() - sum_of_rewards_per_actions = self.rl_learning_loop(num_episode=num_episode, pos_uri=positives, - neg_uri=negatives) + try: + assert len(self.search_tree) == 0 + except AssertionError: + print(len(self.search_tree)) + raise AssertionError('EMPTY search tree') - print(f'Sum of Rewards in first 3 trajectory:{sum_of_rewards_per_actions[:3]}') - print(f'Sum of Rewards in last 3 trajectory:{sum_of_rewards_per_actions[:3]}') + self._number_of_tested_concepts = 0 - self.seen_examples.setdefault(counter, dict()).update( - {'Concept': target_owl_ce, - 'Positives': [i.get_iri().as_str() for i in positives], - 'Negatives': [i.get_iri().as_str() for 
i in negatives]}) + def downward_refinement(self, *args, **kwargs): + ValueError('downward_refinement') - counter += 1 - if counter % 100 == 0: - self.save_weights() - # 3. - return self.terminate_training() + def next_node_to_expand(self) -> RL_State: + """ Return a node that maximizes the heuristic function at time t. """ + return self.search_tree.get_most_promising() class DrillHeuristic: From 5c1ff3bc1d2da442325fd50d6decaf9d2a37f452 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Fri, 8 Dec 2023 17:11:26 +0100 Subject: [PATCH 28/31] WIP: Tree based learner --- ontolearn/learners/__init__.py | 3 +- ontolearn/learners/tree_learner.py | 245 +++++++++++++++++++++++++++++ 2 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 ontolearn/learners/tree_learner.py diff --git a/ontolearn/learners/__init__.py b/ontolearn/learners/__init__.py index 4cb52422..d02bb79d 100644 --- a/ontolearn/learners/__init__.py +++ b/ontolearn/learners/__init__.py @@ -1 +1,2 @@ -from .drill import Drill \ No newline at end of file +from .drill import Drill +from .tree_learner import TreeLearner diff --git a/ontolearn/learners/tree_learner.py b/ontolearn/learners/tree_learner.py new file mode 100644 index 00000000..4d6aacbf --- /dev/null +++ b/ontolearn/learners/tree_learner.py @@ -0,0 +1,245 @@ +import pandas as pd + +from ontolearn.knowledge_base import KnowledgeBase +from typing import Dict, Set, Tuple, List +from ontolearn.learning_problem import PosNegLPStandard +import collections +import matplotlib.pyplot as plt +from sklearn import tree + +from owlapy.model import OWLObjectSomeValuesFrom, OWLObjectPropertyExpression, OWLObjectSomeValuesFrom, OWLObjectAllValuesFrom, \ + OWLObjectIntersectionOf, OWLClassExpression, OWLNothing, OWLThing, OWLNaryBooleanClassExpression, \ + OWLObjectUnionOf, OWLClass, OWLObjectComplementOf, OWLObjectMaxCardinality, OWLObjectMinCardinality, \ + OWLDataSomeValuesFrom, OWLDatatypeRestriction, OWLLiteral, OWLObjectInverseOf, OWLDataProperty, \ + OWLDataHasValue, OWLObjectHasValue + + +def extract_cbd(dataframe) -> Dict[str, Set[Tuple[str, str]]]: + """ + Extract concise bounded description for each entity, where the entity is a subject entity. + Create a mapping from a node to out-going edges and connected nodes + :param dataframe: + :return: + """ + # Extract concise bounded description for each entity, where the entity is a subject entity. 
+ data = dict() + for i in dataframe.values.tolist(): + subject_, predicate_, object_ = i + data.setdefault(subject_, set()).add((predicate_, object_)) + return data + + +def base_construct_second(cbd_entities: Dict[str, Set[Tuple[str, str]]], rows: List[str], + feature_names: List[Tuple[str, str]]): + """ + :param cbd_entities: concise bounded description for each entity, where the entity is a subject entity that is + mapped to a predict and an object entity + :param rows: Individuals + """ + assert cbd_entities is not None, "No cbd entities" + result = [] + for s in rows: + # (1) Initialize an empty row + row = [False for _ in feature_names] + for (p, o) in cbd_entities[s]: + idx = feature_names.index((p, o)) + # (2) Fill th row with nodes/object entities + assert row[idx] is False + row[idx] = True + result.append(row) + result = pd.DataFrame(data=result, index=rows, columns=feature_names, dtype="category") + # print(f"Constructed tabular shape: {result.shape}") + # print("Features/Columns:", result.columns.tolist()) + return result + + +def explain_inference(clf, X_test, features, only_shared): + reports = [] + n_nodes = clf.tree_.node_count + children_left = clf.tree_.children_left + children_right = clf.tree_.children_right + feature = clf.tree_.feature + threshold = clf.tree_.threshold + values = clf.tree_.value + # Positives + node_indicator = clf.decision_path(X_test) + leaf_id = clf.apply(X_test) + + if only_shared: + sample_ids = range(len(X_test)) + # boolean array indicating the nodes both samples go through + common_nodes = node_indicator.toarray()[sample_ids].sum(axis=0) == len(sample_ids) + # obtain node ids using position in array + common_node_id = np.arange(n_nodes)[common_nodes] + + print( + "The following samples {samples} share the node(s) {nodes} in the tree.".format( + samples=sample_ids, nodes=common_node_id + ) + ) + print("This is {prop}% of all nodes.".format(prop=100 * len(common_node_id) / n_nodes)) + return None + + for sample_id in range(len(X_test)): + # obtain ids of the nodes `sample_id` goes through, i.e., row `sample_id` + node_index = node_indicator.indices[ + node_indicator.indptr[sample_id]: node_indicator.indptr[sample_id + 1] + ] + # print("Rules used to predict sample {id}:\n".format(id=sample_id)) + decision_path = [] + for node_id in node_index: + # continue to the next node if it is a leaf node + if leaf_id[sample_id] == node_id: + continue + + # check if value of the split feature for sample 0 is below threshold + if X_test[sample_id, feature[node_id]] <= threshold[node_id]: + threshold_sign = "<=" + else: + threshold_sign = ">" + + # report = f"decision node {node_id} : ({features[feature[node_id]]} = {X_test[sample_id, feature[node_id]]}) {threshold_sign} {threshold[node_id]})" + decision_path.append({"decision_node": node_id, "feature": features[feature[node_id]], + "value": X_test[sample_id, feature[node_id]]}) + reports.append(decision_path) + return reports + + +class TreeLearner: + def __init__(self, knowledge_base, dataframe_triples: pd.DataFrame): + assert isinstance(dataframe_triples, pd.DataFrame), "dataframe_triples must be a Pandas DataFrame" + assert isinstance(knowledge_base, KnowledgeBase), "knowledge_base must be a KnowledgeBase instance" + assert len( + dataframe_triples) > 0, f"length of the dataframe must be greater than 0:Currently {dataframe_triples.shape}" + self.knowledge_base = knowledge_base + self.owl_classes_dict = {c.get_iri().as_str(): c for c in self.knowledge_base.get_concepts()} + self.owl_object_property_dict 
= {p.get_iri().as_str(): p for p in self.knowledge_base.get_object_properties()} + self.owl_individuals = {i.get_iri().as_str(): i for i in self.knowledge_base.individuals()} + + self.dataframe_triples = dataframe_triples + # Remove some triples triples + self.dataframe_triples = self.dataframe_triples[ + ~((self.dataframe_triples["relation"] == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type") & ( + (self.dataframe_triples["object"] == "http://www.w3.org/2002/07/owl#NamedIndividual") | ( + self.dataframe_triples["object"] == "http://www.w3.org/2002/07/owl#Thing") | ( + self.dataframe_triples["object"] == "Ontology")))] + + self.cbd_mapping: Dict[str, Set[Tuple[str, str]]] + self.cbd_mapping = extract_cbd(self.dataframe_triples) + + self.str_individuals = list({i.get_iri().as_str() for i in self.knowledge_base.individuals()}) + + self.cbd_mapping_entities = {k: v for k, v in self.cbd_mapping.items() if k in self.str_individuals} + + self.Xraw = base_construct_second(cbd_entities=self.cbd_mapping_entities, + rows=self.str_individuals, + feature_names=[("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", + c.get_iri().as_str()) for c in + self.knowledge_base.get_concepts()] + + [(r.get_iri().as_str(), i) for i in self.str_individuals for r + in self.knowledge_base.get_object_properties()]) + assert len(self.Xraw) == len(self.str_individuals), "Xraw must be equal to individuals" + self.clf = None + + def labeling(self, pos, neg, apply_dummy=True): + """ + + """ + # (5) Labeling: Label each row/node + # Drop "label" if exists + + self.Xraw.loc[:, "label"] = 0 # unknowns + self.Xraw.loc[pos, "label"] = 1 # positives + self.Xraw.loc[neg, "label"] = -1 # negatives + # (5.1) drop unknowns although unknowns provide info + X = self.Xraw[self.Xraw.label != 0] + + raw_features = X.columns.tolist() + raw_features.remove("label") + if apply_dummy: + X_train_sparse = pd.get_dummies(X[raw_features]) + else: + X_train_sparse = X[raw_features] + y_train_sparse = X.loc[:, "label"] + + print(f"Train data shape:{X_train_sparse.shape}") + return X_train_sparse, y_train_sparse + + def fit(self, lp: PosNegLPStandard, max_runtime=None): + pos = lp.pos + neg = lp.neg + + str_pos_examples = [i.get_iri().as_str() for i in lp.pos] + str_neg_examples = [i.get_iri().as_str() for i in lp.neg] + + X, y = self.labeling(pos=str_pos_examples, neg=str_neg_examples, apply_dummy=False) + # Binaries + self.clf = tree.DecisionTreeClassifier(max_depth=10, random_state=0).fit(X=X.values, y=y.values) + + # plt.figure(figsize=(30, 30)) + # tree.plot_tree(self.clf, fontsize=10, feature_names=X.columns.to_list()) + # plt.show() + + representation_of_positives = X.loc[str_pos_examples].values + concepts = [] + for sequence_of_reasoning_steps, single_positive_individuals in zip( + explain_inference(self.clf, X_test=representation_of_positives, features=X.columns.to_list(), + only_shared=False), str_pos_examples): + # print("Predicted as :", self.clf.predict(X.loc[single_positive_individuals].values.reshape(1, -1))) + + dl_concept = None + for step, reasoning_step in enumerate(sequence_of_reasoning_steps): + # tail can be individual or class + relation, tail = reasoning_step["feature"] + if relation == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type": + + if reasoning_step["value"] is True: + owl_class = self.owl_classes_dict[tail] + else: + owl_class = self.owl_classes_dict[tail].get_object_complement_of() + + assert single_positive_individuals in [i.get_iri().as_str() for i in + self.knowledge_base.individuals(owl_class)] + + else: + + 
if reasoning_step["value"] is True: + owl_class = OWLObjectHasValue(property=self.owl_object_property_dict[relation],individual=self.owl_individuals[tail]) + + else: + owl_class = OWLObjectHasValue(property=self.owl_object_property_dict[relation], + individual=self.owl_individuals[tail]).get_object_complement_of() + + assert single_positive_individuals in [i.get_iri().as_str() for i in self.knowledge_base.individuals(owl_class)] + + if dl_concept is None: + dl_concept = owl_class + else: + dl_concept = OWLObjectIntersectionOf((dl_concept, owl_class)) + + """ + predicate, tail = step["feature"] + + if predicate == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type": + if step['value']: + decision.append(self.owl_classes_dict[tail]) + else: + decision.append(self.owl_classes_dict[tail].get_object_complement_of()) + else: + types = self.dataframe_triples[(self.dataframe_triples.subject == tail) & ( + self.dataframe_triples.relation == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type")][ + "object"].tolist() + for type_tail in types: + exist_decision.append(self.knowledge_base.generator.existential_restriction( + property=self.owl_object_property_dict[predicate], filler=self.owl_classes_dict[type_tail])) + """ + + concepts.append(dl_concept) + + for i in concepts: + print(i) + # TODO: + # best_hypothesis = construct_description_logic_concept(trained_clf, df, X, pos_examples, neg_examples) + + exit(1) + pass From a475adfeeab173dde2a7312aeed11cdaf5c87ff5 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Fri, 8 Dec 2023 18:19:17 +0100 Subject: [PATCH 29/31] From decision tree, set of dl concepts are generated --- ontolearn/learners/tree_learner.py | 67 +++++++++++++++++------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/ontolearn/learners/tree_learner.py b/ontolearn/learners/tree_learner.py index 4d6aacbf..98339ce5 100644 --- a/ontolearn/learners/tree_learner.py +++ b/ontolearn/learners/tree_learner.py @@ -7,7 +7,8 @@ import matplotlib.pyplot as plt from sklearn import tree -from owlapy.model import OWLObjectSomeValuesFrom, OWLObjectPropertyExpression, OWLObjectSomeValuesFrom, OWLObjectAllValuesFrom, \ +from owlapy.model import OWLObjectSomeValuesFrom, OWLObjectPropertyExpression, OWLObjectSomeValuesFrom, \ + OWLObjectAllValuesFrom, \ OWLObjectIntersectionOf, OWLClassExpression, OWLNothing, OWLThing, OWLNaryBooleanClassExpression, \ OWLObjectUnionOf, OWLClass, OWLObjectComplementOf, OWLObjectMaxCardinality, OWLObjectMinCardinality, \ OWLDataSomeValuesFrom, OWLDatatypeRestriction, OWLLiteral, OWLObjectInverseOf, OWLDataProperty, \ @@ -106,11 +107,12 @@ def explain_inference(clf, X_test, features, only_shared): class TreeLearner: - def __init__(self, knowledge_base, dataframe_triples: pd.DataFrame): + def __init__(self, knowledge_base, dataframe_triples: pd.DataFrame, quality_func): assert isinstance(dataframe_triples, pd.DataFrame), "dataframe_triples must be a Pandas DataFrame" assert isinstance(knowledge_base, KnowledgeBase), "knowledge_base must be a KnowledgeBase instance" assert len( dataframe_triples) > 0, f"length of the dataframe must be greater than 0:Currently {dataframe_triples.shape}" + self.quality_func = quality_func self.knowledge_base = knowledge_base self.owl_classes_dict = {c.get_iri().as_str(): c for c in self.knowledge_base.get_concepts()} self.owl_object_property_dict = {p.get_iri().as_str(): p for p in self.knowledge_base.get_object_properties()} @@ -152,7 +154,7 @@ def labeling(self, pos, neg, apply_dummy=True): self.Xraw.loc[pos, "label"] = 1 # 
positives self.Xraw.loc[neg, "label"] = -1 # negatives # (5.1) drop unknowns although unknowns provide info - X = self.Xraw[self.Xraw.label != 0] + X = self.Xraw # self.Xraw[self.Xraw.label != 0] raw_features = X.columns.tolist() raw_features.remove("label") @@ -165,10 +167,17 @@ def labeling(self, pos, neg, apply_dummy=True): print(f"Train data shape:{X_train_sparse.shape}") return X_train_sparse, y_train_sparse - def fit(self, lp: PosNegLPStandard, max_runtime=None): - pos = lp.pos - neg = lp.neg + def compute_quality(self, concept, pos, neg): + instances = set(self.knowledge_base.individuals(concept)) + tp = len(pos.intersection(instances)) + tn = len(neg.difference(instances)) + + fp = len(neg.intersection(instances)) + fn = len(pos.difference(instances)) + _, f1_score = self.quality_func.score2(tp=tp, fn=fn, fp=fp, tn=tn) + return f1_score + def fit(self, lp: PosNegLPStandard, max_runtime=None): str_pos_examples = [i.get_iri().as_str() for i in lp.pos] str_neg_examples = [i.get_iri().as_str() for i in lp.neg] @@ -179,21 +188,23 @@ def fit(self, lp: PosNegLPStandard, max_runtime=None): # plt.figure(figsize=(30, 30)) # tree.plot_tree(self.clf, fontsize=10, feature_names=X.columns.to_list()) # plt.show() - + from owlapy.render import DLSyntaxObjectRenderer + render=DLSyntaxObjectRenderer() representation_of_positives = X.loc[str_pos_examples].values - concepts = [] + dl_concepts = set() for sequence_of_reasoning_steps, single_positive_individuals in zip( explain_inference(self.clf, X_test=representation_of_positives, features=X.columns.to_list(), only_shared=False), str_pos_examples): - # print("Predicted as :", self.clf.predict(X.loc[single_positive_individuals].values.reshape(1, -1))) - - dl_concept = None + predicted_class=self.clf.predict(X.loc[single_positive_individuals].values.reshape(1, -1)) + assert predicted_class==1 for step, reasoning_step in enumerate(sequence_of_reasoning_steps): + # print(f"\t{reasoning_step}") # tail can be individual or class relation, tail = reasoning_step["feature"] + # from numpy.bool_ to python bool + value = bool(reasoning_step["value"]) if relation == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type": - - if reasoning_step["value"] is True: + if value: owl_class = self.owl_classes_dict[tail] else: owl_class = self.owl_classes_dict[tail].get_object_complement_of() @@ -201,21 +212,20 @@ def fit(self, lp: PosNegLPStandard, max_runtime=None): assert single_positive_individuals in [i.get_iri().as_str() for i in self.knowledge_base.individuals(owl_class)] + dl_concepts.add(owl_class) else: - if reasoning_step["value"] is True: - owl_class = OWLObjectHasValue(property=self.owl_object_property_dict[relation],individual=self.owl_individuals[tail]) + if value: + dl_concepts.add(OWLObjectHasValue(property=self.owl_object_property_dict[relation], individual=self.owl_individuals[tail])) + for i in self.knowledge_base.get_types(self.owl_individuals[tail]): + dl_concepts.add(OWLObjectSomeValuesFrom(property=self.owl_object_property_dict[relation], filler=i)) else: - owl_class = OWLObjectHasValue(property=self.owl_object_property_dict[relation], - individual=self.owl_individuals[tail]).get_object_complement_of() + continue + #owl_class = OWLObjectHasValue(property=self.owl_object_property_dict[relation], + # individual=self.owl_individuals[tail]).get_object_complement_of() - assert single_positive_individuals in [i.get_iri().as_str() for i in self.knowledge_base.individuals(owl_class)] - if dl_concept is None: - dl_concept = owl_class - else: - dl_concept = 
OWLObjectIntersectionOf((dl_concept, owl_class)) """ predicate, tail = step["feature"] @@ -233,13 +243,10 @@ def fit(self, lp: PosNegLPStandard, max_runtime=None): exist_decision.append(self.knowledge_base.generator.existential_restriction( property=self.owl_object_property_dict[predicate], filler=self.owl_classes_dict[type_tail])) """ - - concepts.append(dl_concept) - - for i in concepts: - print(i) - # TODO: - # best_hypothesis = construct_description_logic_concept(trained_clf, df, X, pos_examples, neg_examples) - + # @TODO: Do something with them ! + for i in dl_concepts: + print(render.render(i), end="\t") + f1_score = self.compute_quality(concept=i, pos=lp.pos, neg=lp.neg) + print(f1_score) exit(1) pass From cbb77e216341876bdeba6c29f22e22134f175b68 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Sun, 10 Dec 2023 10:00:47 +0100 Subject: [PATCH 30/31] A path in a decision tree can be mapped to an intersection of dl concepts --- ontolearn/learners/tree_learner.py | 171 ++++++++++++++++++----------- 1 file changed, 105 insertions(+), 66 deletions(-) diff --git a/ontolearn/learners/tree_learner.py b/ontolearn/learners/tree_learner.py index 98339ce5..fd7691e1 100644 --- a/ontolearn/learners/tree_learner.py +++ b/ontolearn/learners/tree_learner.py @@ -1,3 +1,4 @@ +import owlapy.model import pandas as pd from ontolearn.knowledge_base import KnowledgeBase @@ -5,14 +6,15 @@ from ontolearn.learning_problem import PosNegLPStandard import collections import matplotlib.pyplot as plt +import sklearn from sklearn import tree from owlapy.model import OWLObjectSomeValuesFrom, OWLObjectPropertyExpression, OWLObjectSomeValuesFrom, \ OWLObjectAllValuesFrom, \ OWLObjectIntersectionOf, OWLClassExpression, OWLNothing, OWLThing, OWLNaryBooleanClassExpression, \ OWLObjectUnionOf, OWLClass, OWLObjectComplementOf, OWLObjectMaxCardinality, OWLObjectMinCardinality, \ - OWLDataSomeValuesFrom, OWLDatatypeRestriction, OWLLiteral, OWLObjectInverseOf, OWLDataProperty, \ - OWLDataHasValue, OWLObjectHasValue + OWLDataSomeValuesFrom, OWLDatatypeRestriction, OWLLiteral, OWLDataHasValue, OWLObjectHasValue +from owlapy.render import DLSyntaxObjectRenderer def extract_cbd(dataframe) -> Dict[str, Set[Tuple[str, str]]]: @@ -167,86 +169,123 @@ def labeling(self, pos, neg, apply_dummy=True): print(f"Train data shape:{X_train_sparse.shape}") return X_train_sparse, y_train_sparse - def compute_quality(self, concept, pos, neg): - instances = set(self.knowledge_base.individuals(concept)) + def compute_quality(self, instances, pos, neg, conf_matrix=False): + assert isinstance(instances, set) tp = len(pos.intersection(instances)) tn = len(neg.difference(instances)) fp = len(neg.intersection(instances)) fn = len(pos.difference(instances)) + _, f1_score = self.quality_func.score2(tp=tp, fn=fn, fp=fp, tn=tn) + if conf_matrix: + return f1_score, f"TP:{tp}\tFN:{fn}\tFP:{fp}\tTN:{tn}" return f1_score + def union_and_intersect(self, filtered_hypothesis): + intersections_and_unions = set() + for c in filtered_hypothesis: + for other in filtered_hypothesis: + intersections_and_unions.add(OWLObjectIntersectionOf((c, other))) + intersections_and_unions.add(OWLObjectUnionOf((c, other))) + + return intersections_and_unions.union(filtered_hypothesis) + + def decision_to_owl_class_exp(self, reasoning_step: dict, single_positive_indv): + """ + + """ + # print(f"\t{reasoning_step}") + # tail can be individual or class + relation, tail = reasoning_step["feature"] + # from numpy.bool_ to python bool + value = bool(reasoning_step["value"]) + if relation 
== "http://www.w3.org/1999/02/22-rdf-syntax-ns#type": + if value: + owl_class = self.owl_classes_dict[tail] + assert self.owl_individuals[single_positive_indv] in self.knowledge_base.individuals(owl_class) + else: + owl_class = self.owl_classes_dict[tail].get_object_complement_of() + assert self.owl_individuals[single_positive_indv] in self.knowledge_base.individuals(owl_class) + else: + owl_class = OWLObjectHasValue(property=self.owl_object_property_dict[relation], + individual=self.owl_individuals[tail]) + if value: + assert self.owl_individuals[single_positive_indv] in self.knowledge_base.individuals( + owl_class) + else: + owl_class = owl_class.get_object_complement_of() + assert self.owl_individuals[single_positive_indv] in self.knowledge_base.individuals( + owl_class) + return owl_class + + def cumulative_intersection_from_iterable(self, concepts): + result = None + for i in concepts: + if result is None: + result = i + else: + result = OWLObjectIntersectionOf((result, i)) + + return result + + def intersect_of_concepts(self, concepts): + dl_concept_path = None + for c in concepts: + if dl_concept_path is None: + dl_concept_path = c + else: + dl_concept_path = OWLObjectIntersectionOf((dl_concept_path, c)) + return dl_concept_path + def fit(self, lp: PosNegLPStandard, max_runtime=None): str_pos_examples = [i.get_iri().as_str() for i in lp.pos] str_neg_examples = [i.get_iri().as_str() for i in lp.neg] X, y = self.labeling(pos=str_pos_examples, neg=str_neg_examples, apply_dummy=False) # Binaries - self.clf = tree.DecisionTreeClassifier(max_depth=10, random_state=0).fit(X=X.values, y=y.values) - + self.clf = tree.DecisionTreeClassifier(random_state=0).fit(X=X.values, y=y.values) + print("Classification Report: Negatives: -1, Unknowns:0, Positives 1 ") + print(sklearn.metrics.classification_report(y.values, self.clf.predict(X.values), target_names=None)) # plt.figure(figsize=(30, 30)) # tree.plot_tree(self.clf, fontsize=10, feature_names=X.columns.to_list()) # plt.show() - from owlapy.render import DLSyntaxObjectRenderer - render=DLSyntaxObjectRenderer() - representation_of_positives = X.loc[str_pos_examples].values - dl_concepts = set() - for sequence_of_reasoning_steps, single_positive_individuals in zip( - explain_inference(self.clf, X_test=representation_of_positives, features=X.columns.to_list(), + + render = DLSyntaxObjectRenderer() + prediction_per_example = [] + + # () Iterate over E^+ + for sequence_of_reasoning_steps, pos in zip( + explain_inference(self.clf, + X_test=X.loc[str_pos_examples].values, + features=X.columns.to_list(), only_shared=False), str_pos_examples): - predicted_class=self.clf.predict(X.loc[single_positive_individuals].values.reshape(1, -1)) - assert predicted_class==1 - for step, reasoning_step in enumerate(sequence_of_reasoning_steps): - # print(f"\t{reasoning_step}") - # tail can be individual or class - relation, tail = reasoning_step["feature"] - # from numpy.bool_ to python bool - value = bool(reasoning_step["value"]) - if relation == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type": - if value: - owl_class = self.owl_classes_dict[tail] - else: - owl_class = self.owl_classes_dict[tail].get_object_complement_of() - - assert single_positive_individuals in [i.get_iri().as_str() for i in - self.knowledge_base.individuals(owl_class)] - - dl_concepts.add(owl_class) - else: - - if value: - dl_concepts.add(OWLObjectHasValue(property=self.owl_object_property_dict[relation], individual=self.owl_individuals[tail])) - for i in 
self.knowledge_base.get_types(self.owl_individuals[tail]): - dl_concepts.add(OWLObjectSomeValuesFrom(property=self.owl_object_property_dict[relation], filler=i)) - - else: - continue - #owl_class = OWLObjectHasValue(property=self.owl_object_property_dict[relation], - # individual=self.owl_individuals[tail]).get_object_complement_of() - - - - """ - predicate, tail = step["feature"] - - if predicate == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type": - if step['value']: - decision.append(self.owl_classes_dict[tail]) - else: - decision.append(self.owl_classes_dict[tail].get_object_complement_of()) - else: - types = self.dataframe_triples[(self.dataframe_triples.subject == tail) & ( - self.dataframe_triples.relation == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type")][ - "object"].tolist() - for type_tail in types: - exist_decision.append(self.knowledge_base.generator.existential_restriction( - property=self.owl_object_property_dict[predicate], filler=self.owl_classes_dict[type_tail])) - """ - # @TODO: Do something with them ! - for i in dl_concepts: - print(render.render(i), end="\t") - f1_score = self.compute_quality(concept=i, pos=lp.pos, neg=lp.neg) - print(f1_score) + # () Ensure that e \in E^+ is classified as positive + assert 1 == self.clf.predict(X.loc[pos].values.reshape(1, -1)) + # () Reasoning behind of the prediction of a single positive example. + + sequence_of_concept_path_of_tree = [self.decision_to_owl_class_exp(reasoning_step, pos) for + reasoning_step in + sequence_of_reasoning_steps] + pred = self.intersect_of_concepts(sequence_of_concept_path_of_tree) + # SANITY CHECKING: A path starting from root and ending in a leaf for a single positive example must be F1.=0 + assert self.compute_quality(instances={i for i in self.knowledge_base.individuals(pred)}, + pos={self.owl_individuals[pos]}, + neg=lp.neg) == 1.0 + prediction_per_example.append((pred, pos)) + + for dl_concept, str_pos_example in prediction_per_example: + print(f"A positive example:{str_pos_example}") + print(f"Path of DL concepts:{render.render(dl_concept)}") + individuals = {i for i in self.knowledge_base.individuals(dl_concept)} + f1_local = self.compute_quality(instances=individuals, + pos={self.owl_individuals[str_pos_example]}, + neg=lp.neg) + f1_global = self.compute_quality(instances=individuals, + pos=lp.pos, + neg=lp.neg) + + print(f"Local Quality:{f1_local}") + print(f"Global Quality:{f1_global}") + exit(1) - pass From 24dc5358adc5b3a145879c2b36464fb107e198e6 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Sun, 10 Dec 2023 10:59:51 +0100 Subject: [PATCH 31/31] Naive implementation of TreeLearner is completed --- examples/concept_learning_evaluation.py | 67 +++++++++++++++++++++---- ontolearn/learners/tree_learner.py | 59 +++++++++++++++------- 2 files changed, 98 insertions(+), 28 deletions(-) diff --git a/examples/concept_learning_evaluation.py b/examples/concept_learning_evaluation.py index 2a98f44f..9482bea6 100644 --- a/examples/concept_learning_evaluation.py +++ b/examples/concept_learning_evaluation.py @@ -4,30 +4,63 @@ import pandas as pd from ontolearn.knowledge_base import KnowledgeBase from ontolearn.concept_learner import CELOE, OCEL, EvoLearner -from ontolearn import Drill +from ontolearn.learners import Drill, TreeLearner from ontolearn.learning_problem import PosNegLPStandard from ontolearn.metrics import Accuracy, F1 from owlapy.model import OWLClass, OWLNamedIndividual, IRI import argparse +from rdflib import Graph pd.set_option("display.precision", 5) + +def 
compute_f1_score(individuals, pos, neg): + tp = len(pos.intersection(individuals)) + tn = len(neg.difference(individuals)) + + fp = len(neg.intersection(individuals)) + fn = len(pos.difference(individuals)) + + try: + recall = tp / (tp + fn) + except ZeroDivisionError: + return 0 + + try: + precision = tp / (tp + fp) + except ZeroDivisionError: + return 0 + + if precision == 0 or recall == 0: + return 0 + + f_1 = 2 * ((precision * recall) / (precision + recall)) + return f_1 + def dl_concept_learning(args): with open(args.lps) as json_file: settings = json.load(json_file) kb = KnowledgeBase(path=args.kb) + # Our ongoing work + dtl = TreeLearner(knowledge_base=kb, + dataframe_triples=pd.DataFrame( + data=[(str(s), str(p), str(o)) for s, p, o in Graph().parse(args.kb)], + columns=['subject', 'relation', 'object'], dtype=str), quality_func=F1(), + max_runtime=args.max_runtime) drill = Drill(knowledge_base=kb, path_pretrained_kge=args.path_pretrained_kge, quality_func=F1(), max_runtime=args.max_runtime).train(num_of_target_concepts=2, num_learning_problems=2) + ocel = OCEL(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) celoe = CELOE(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) evo = EvoLearner(knowledge_base=kb, quality_func=F1(), max_runtime=args.max_runtime) columns = ["LP", - "OCEL", "F1-OCEL", "RT-OCEL", - "CELOE", "F1-CELOE", "RT-CELOE", - "EvoLearner", "F1-EvoLearner", "RT-EvoLearner", - "DRILL", "F1-DRILL", "RT-DRILL"] + "F1-OCEL", "RT-OCEL", + "F1-CELOE", "RT-CELOE", + "F1-EvoLearner", "RT-EvoLearner", + "F1-DRILL", "RT-DRILL", + "F1-DTL", "RT-DTL"] values = [] for str_target_concept, examples in settings['problems'].items(): p = set(examples['positive_examples']) @@ -39,27 +72,41 @@ def dl_concept_learning(args): typed_neg = set(map(OWLNamedIndividual, map(IRI.create, n))) lp = PosNegLPStandard(pos=typed_pos, neg=typed_neg) + + start_time = time.time() + # Get best prediction + pred_dtl = dtl.fit(lp).best_hypotheses(n=1) + rt_dtl = time.time() - start_time + # Compute quality of best prediction + f1_dtl = compute_f1_score(individuals={i for i in kb.individuals(pred_dtl)}, pos=lp.pos, neg=lp.neg) + start_time = time.time() pred_drill = drill.fit(lp).best_hypotheses(n=1) rt_drill = time.time() - start_time + f1_drill = compute_f1_score(individuals={i for i in kb.individuals(pred_drill.concept)}, pos=lp.pos, neg=lp.neg) + start_time = time.time() pred_ocel = ocel.fit(lp).best_hypotheses(n=1) rt_ocel = time.time() - start_time + f1_ocel = compute_f1_score(individuals={i for i in kb.individuals(pred_ocel.concept)}, pos=lp.pos, neg=lp.neg) start_time = time.time() pred_celoe = celoe.fit(lp).best_hypotheses(n=1) rt_celoe = time.time() - start_time + f1_celoe = compute_f1_score(individuals={i for i in kb.individuals(pred_celoe.concept)}, pos=lp.pos, neg=lp.neg) start_time = time.time() pred_evo = evo.fit(lp).best_hypotheses(n=1) rt_evo = time.time() - start_time + f1_evo = compute_f1_score(individuals={i for i in kb.individuals(pred_evo.concept)}, pos=lp.pos, neg=lp.neg) values.append( [str_target_concept, - pred_ocel.str, pred_ocel.quality, rt_ocel, - pred_celoe.str, pred_celoe.quality, rt_celoe, - pred_evo.str, pred_evo.quality, rt_evo, - pred_drill.str, pred_drill.quality, rt_drill]) + f1_ocel, rt_ocel, + f1_celoe, rt_celoe, + f1_drill, rt_drill, + f1_evo, rt_evo, + f1_dtl, rt_dtl]) df = pd.DataFrame(values, columns=columns) print(df) @@ -69,7 +116,7 @@ def dl_concept_learning(args): if __name__ == '__main__': parser = 
argparse.ArgumentParser(description='Description Logic Concept Learning') - parser.add_argument("--max_runtime", type=int, default=10) + parser.add_argument("--max_runtime", type=int, default=3) parser.add_argument("--lps", type=str, default="synthetic_problems.json") parser.add_argument("--kb", type=str, default="../KGs/Family/family-benchmark_rich_background.owl") parser.add_argument("--path_pretrained_kge", type=str, default=None) diff --git a/ontolearn/learners/tree_learner.py b/ontolearn/learners/tree_learner.py index fd7691e1..ed3c8e26 100644 --- a/ontolearn/learners/tree_learner.py +++ b/ontolearn/learners/tree_learner.py @@ -109,7 +109,7 @@ def explain_inference(clf, X_test, features, only_shared): class TreeLearner: - def __init__(self, knowledge_base, dataframe_triples: pd.DataFrame, quality_func): + def __init__(self, knowledge_base, dataframe_triples: pd.DataFrame, quality_func, max_runtime): assert isinstance(dataframe_triples, pd.DataFrame), "dataframe_triples must be a Pandas DataFrame" assert isinstance(knowledge_base, KnowledgeBase), "knowledge_base must be a KnowledgeBase instance" assert len( @@ -120,6 +120,7 @@ def __init__(self, knowledge_base, dataframe_triples: pd.DataFrame, quality_func self.owl_object_property_dict = {p.get_iri().as_str(): p for p in self.knowledge_base.get_object_properties()} self.owl_individuals = {i.get_iri().as_str(): i for i in self.knowledge_base.individuals()} + self.best_pred = None self.dataframe_triples = dataframe_triples # Remove some triples triples self.dataframe_triples = self.dataframe_triples[ @@ -166,7 +167,7 @@ def labeling(self, pos, neg, apply_dummy=True): X_train_sparse = X[raw_features] y_train_sparse = X.loc[:, "label"] - print(f"Train data shape:{X_train_sparse.shape}") + # print(f"Train data shape:{X_train_sparse.shape}") return X_train_sparse, y_train_sparse def compute_quality(self, instances, pos, neg, conf_matrix=False): @@ -203,20 +204,20 @@ def decision_to_owl_class_exp(self, reasoning_step: dict, single_positive_indv): if relation == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type": if value: owl_class = self.owl_classes_dict[tail] - assert self.owl_individuals[single_positive_indv] in self.knowledge_base.individuals(owl_class) + # assert self.owl_individuals[single_positive_indv] in self.knowledge_base.individuals(owl_class) else: owl_class = self.owl_classes_dict[tail].get_object_complement_of() - assert self.owl_individuals[single_positive_indv] in self.knowledge_base.individuals(owl_class) + # assert self.owl_individuals[single_positive_indv] in self.knowledge_base.individuals(owl_class) else: owl_class = OWLObjectHasValue(property=self.owl_object_property_dict[relation], individual=self.owl_individuals[tail]) if value: - assert self.owl_individuals[single_positive_indv] in self.knowledge_base.individuals( - owl_class) + pass + # assert self.owl_individuals[single_positive_indv] in self.knowledge_base.individuals(owl_class) else: owl_class = owl_class.get_object_complement_of() - assert self.owl_individuals[single_positive_indv] in self.knowledge_base.individuals( - owl_class) + # assert self.owl_individuals[single_positive_indv] in self.knowledge_base.individuals(owl_class) + return owl_class def cumulative_intersection_from_iterable(self, concepts): @@ -238,6 +239,19 @@ def intersect_of_concepts(self, concepts): dl_concept_path = OWLObjectIntersectionOf((dl_concept_path, c)) return dl_concept_path + def union_of_concepts(self, concepts): + dl_concept_path = None + for c in concepts: + if dl_concept_path is 
None: + dl_concept_path = c + else: + dl_concept_path = OWLObjectUnionOf((dl_concept_path, c)) + return dl_concept_path + + def best_hypotheses(self, n=1): + assert n == 1 + return self.best_pred + def fit(self, lp: PosNegLPStandard, max_runtime=None): str_pos_examples = [i.get_iri().as_str() for i in lp.pos] str_neg_examples = [i.get_iri().as_str() for i in lp.neg] @@ -245,8 +259,8 @@ def fit(self, lp: PosNegLPStandard, max_runtime=None): X, y = self.labeling(pos=str_pos_examples, neg=str_neg_examples, apply_dummy=False) # Binaries self.clf = tree.DecisionTreeClassifier(random_state=0).fit(X=X.values, y=y.values) - print("Classification Report: Negatives: -1, Unknowns:0, Positives 1 ") - print(sklearn.metrics.classification_report(y.values, self.clf.predict(X.values), target_names=None)) + # print("Classification Report: Negatives: -1, Unknowns:0, Positives 1 ") + # print(sklearn.metrics.classification_report(y.values, self.clf.predict(X.values), target_names=None)) # plt.figure(figsize=(30, 30)) # tree.plot_tree(self.clf, fontsize=10, feature_names=X.columns.to_list()) # plt.show() @@ -269,14 +283,20 @@ def fit(self, lp: PosNegLPStandard, max_runtime=None): sequence_of_reasoning_steps] pred = self.intersect_of_concepts(sequence_of_concept_path_of_tree) # SANITY CHECKING: A path starting from root and ending in a leaf for a single positive example must be F1.=0 - assert self.compute_quality(instances={i for i in self.knowledge_base.individuals(pred)}, - pos={self.owl_individuals[pos]}, - neg=lp.neg) == 1.0 + # assert self.compute_quality(instances={i for i in self.knowledge_base.individuals(pred)}, + # pos={self.owl_individuals[pos]}, + # neg=lp.neg) == 1.0 prediction_per_example.append((pred, pos)) + self.best_pred = self.union_of_concepts([pred for pred, pos in prediction_per_example]) + """ + # print(f"Union Of paths of DL concepts:{render.render(final_pred)}") + # individuals_final_pred = {i for i in self.knowledge_base.individuals(final_pred)} + + for dl_concept, str_pos_example in prediction_per_example: - print(f"A positive example:{str_pos_example}") - print(f"Path of DL concepts:{render.render(dl_concept)}") + # print(f"A positive example:{str_pos_example}") + # print(f"Path of DL concepts:{render.render(dl_concept)}") individuals = {i for i in self.knowledge_base.individuals(dl_concept)} f1_local = self.compute_quality(instances=individuals, pos={self.owl_individuals[str_pos_example]}, @@ -285,7 +305,10 @@ def fit(self, lp: PosNegLPStandard, max_runtime=None): pos=lp.pos, neg=lp.neg) - print(f"Local Quality:{f1_local}") - print(f"Global Quality:{f1_global}") + # print(f"Local Quality:{f1_local}") + # print(f"Global Quality:{f1_global}") + + # print(f"Global Quality of Final :{self.compute_quality(instances=individuals_final_pred, pos=lp.pos, neg=lp.neg)}") + """ - exit(1) + return self
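
As an aside to the last two patches, the snippet below sketches the core idea in isolation: every decision that the trained scikit-learn tree takes for a positive example is translated into an OWL class expression (the class itself when the binary membership feature is set, its complement otherwise), the expressions along one root-to-leaf path are intersected, and the per-example intersections are unioned into a single hypothesis, which is what TreeLearner.fit now stores in best_pred. The toy membership matrix, the example IRIs and the path_to_intersection helper are assumptions invented for illustration; only the owlapy constructors and the scikit-learn decision-path API correspond to what the patches use.

# Illustrative sketch only -- not part of the patch series above.
# The class IRIs and the toy membership matrix are invented for this example.
import numpy as np
from sklearn import tree
from owlapy.model import IRI, OWLClass, OWLObjectIntersectionOf, OWLObjectUnionOf

feature_classes = [OWLClass(IRI.create("http://example.com/Parent")),   # column 0
                   OWLClass(IRI.create("http://example.com/Male"))]     # column 1
X = np.array([[1, 1],    # positive example
              [1, 0],    # negative example
              [0, 1]])   # negative example
y = np.array([1, -1, -1])

clf = tree.DecisionTreeClassifier(random_state=0).fit(X, y)


def path_to_intersection(clf, x, classes):
    """Translate the decisions taken for one example into an intersection of class expressions."""
    nodes_on_path = clf.decision_path(x.reshape(1, -1)).indices
    leaf = clf.apply(x.reshape(1, -1))[0]
    operands = []
    for node in nodes_on_path:
        if node == leaf:          # the leaf itself carries no test
            continue
        feature = clf.tree_.feature[node]
        owl_class = classes[feature]
        # Binary membership feature: failing the split (<= threshold) means "not a member".
        if x[feature] <= clf.tree_.threshold[node]:
            operands.append(owl_class.get_object_complement_of())
        else:
            operands.append(owl_class)
    return operands[0] if len(operands) == 1 else OWLObjectIntersectionOf(operands)


# One intersection per positive example, unioned into the final hypothesis.
per_positive = [path_to_intersection(clf, x, feature_classes)
                for x, label in zip(X, y) if label == 1]
hypothesis = per_positive[0] if len(per_positive) == 1 else OWLObjectUnionOf(per_positive)
print(hypothesis)

The patches additionally verify each path against the reasoner (knowledge_base.individuals) and score the resulting union with F1 over the positive and negative examples; the sketch leaves that part out to stay self-contained.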