test v0.0.9.3

andyjslee · andyjslee · commit 203b03ec2f43 · 2023-07-24T11:02:58.000-04:00
diff --git a/examples/test.sh b/examples/test.sh
@@ -0,0 +1,49 @@
+#ace generate \
+#  --num-peptides 120 \
+#  --num-peptides-per-pool 12 \
+#  --num-coverage 3 \
+#  --num-processes 6 \
+#  --mode golfy \
+#  --output-excel-file test_golfy.xlsx \
+#  --assign-well-ids 1 \
+#  --plate-type 96-well_plate
+
+ace generate \
+  --num-peptides 120 \
+  --num-peptides-per-pool 12 \
+  --num-coverage 3 \
+  --num-processes 6 \
+  --mode golfy \
+  --golfy-allow-extra-pools False \
+  --output-excel-file test_golfy.xlsx \
+  --assign-well-ids 1 \
+  --plate-type 96-well_plate
+
+echo ""
+
+ace generate \
+  --num-peptides 120 \
+  --num-peptides-per-pool 12 \
+  --num-coverage 3 \
+  --num-processes 6 \
+  --mode sat_solver \
+  --output-excel-file test_sat.xlsx \
+  --shuffle-iters 1000 \
+  --max-peptides-per-block 100 \
+  --max-peptides-per-pool 10 \
+  --assign-well-ids 1 \
+  --plate-type 96-well_plate
+
+#ace generate \
+#  --num-peptides 100 \
+#  --num-peptides-per-pool 5 \
+#  --num-coverage 3 \
+#  --num-processes 6 \
+#  --mode sat_solver \
+#  --output-excel-file test.xlsx \
+#  --shuffle-iters 0 \
+#  --max-peptides-per-block 100 \
+#  --max-peptides-per-pool 10 \
+#  --assign-well-ids 1 \
+#  --plate-type 96-well_plate
+
diff --git a/pyproject.toml b/pyproject.toml
@@ -25,7 +25,7 @@ requires = [
 
 [project]
 name = "ace"
-version = "0.0.9.0"
+version = "0.0.9.3"
 requires-python = ">=3.7"
 keywords = [
     "elispot",
diff --git a/src/acelib/block_assignment.py b/src/acelib/block_assignment.py
@@ -87,27 +87,31 @@ def add_peptide(
             self.assignments[coverage][pool] = []
         self.assignments[coverage][pool].append((peptide_id, peptide_sequence))
 
-    def count_violations(self) -> int:
+    def num_violations(self) -> float:
         """
-        Counts the number of violations (i.e. number of peptides with
-        non-unique pool assignment).
+        Returns the number of violations (i.e. for every pair of peptides,
+        the number of pools they share beyond the first one).
 
         Returns
         -------
         num_violations      :   Number of violations.
         """
+        # Step 1. Map each peptide ID to the list of pool IDs it is assigned to
         df_assignments = self.to_dataframe()
-        pool_ids_peptides_dict = defaultdict(list)
-        for peptide_id in list(df_assignments['peptide_id'].unique()):
-            pool_ids = list(df_assignments.loc[df_assignments['peptide_id'] == peptide_id, 'pool_id'].unique())
-            pool_ids = sorted(pool_ids)
-            pool_ids_peptides_dict[','.join([str(i) for i in pool_ids])].append(peptide_id)
+        peptide_pool_dict = defaultdict(list)
+        peptide_ids = list(df_assignments['peptide_id'].unique())
+        for peptide_id in peptide_ids:
+            peptide_pool_dict[peptide_id] = list(df_assignments.loc[df_assignments['peptide_id'] == peptide_id, 'pool_id'].unique())
 
+        # Step 2. Count the violations across all pairs of peptides
         num_violations = 0
-        for key, value in pool_ids_peptides_dict.items():
-            if len(value) > 1:
-                for peptide_id in value:
-                    num_violations += 1
+        for i in range(0, len(peptide_ids)):
+            for j in range(i + 1, len(peptide_ids)):
+                p1_pools = peptide_pool_dict[peptide_ids[i]]
+                p2_pools = peptide_pool_dict[peptide_ids[j]]
+                shared_pools = set(p1_pools).intersection(set(p2_pools))
+                if len(shared_pools) > 1:
+                    num_violations += len(shared_pools) - 1
         return num_violations
 
     def to_dataframe(self) -> pd.DataFrame:
@@ -144,7 +148,8 @@ def is_optimal(
         Verifies whether a given ELISpot assignment satisfies the following constraints:
         1. Each peptide is in 'num_coverage' number of different pools.
         2. Each peptide is in exactly one unique combination of pool IDs.
-        3. There is an optimal (minimal) number of pools.
+        3. No two peptides are pooled together more than once.
+        4. There is an optimal (minimal) number of pools.
 
         Parameters
         ---------
@@ -182,28 +187,41 @@ def is_optimal(
         for key, value in pool_ids_peptides_dict.items():
             if len(value) > 1:
                 if verbose:
-                    logger.info("Assignment does not meet constraint #2. Pools %s have the following peptides:" % key)
-                    for peptide_id in value:
-                        logger.info(peptide_id)
+                    if constraint_2_bool:
+                        logger.info("Assignment does not meet constraint #2: there are peptides that do not belong to exactly one unique combination of pool IDs.")
+                    logger.info("\tPools %s have the following peptides: %s." % (key, ','.join(value)))
                 constraint_2_bool = False
         if constraint_2_bool:
             if verbose:
                 logger.info('Assignment meets constraint #2: each peptide belongs to exactly one unique combination of pool IDs.')
 
-        # Step 3. Check that there is an optimal number of pools
+        # Step 3. Check that no two peptides are pooled together more than once
         constraint_3_bool = True
+        num_violations = self.num_violations()
+        if num_violations > 0:
+            constraint_3_bool = False
+            if verbose:
+                logger.info("Assignment does not meet constraint #3: violation score is %i "
+                            "(proxy of number of times peptide pairs are pooled together more than once)." %
+                            num_violations)
+        else:
+            if verbose:
+                logger.info('Assignment meets constraint #3: every pair of peptides is pooled together at most once.')
+
+        # Step 4. Check that there is an optimal number of pools
+        constraint_4_bool = True
         num_pools = math.ceil(len(df_assignments['peptide_id'].unique()) / num_peptides_per_pool) * num_coverage
         if len(df_assignments['pool_id'].unique()) != num_pools:
             num_extra_pools = len(df_assignments['pool_id'].unique()) - num_pools
             if verbose:
-                logger.info('Assignment does not meet constraint #3: %i extra pool(s) than the minimum possible number of pools (%i).' %
+                logger.info('Assignment does not meet constraint #4: %i extra pool(s) than the minimum possible number of pools (%i).' %
                             (num_extra_pools, num_pools))
-            constraint_3_bool = False
-        if constraint_3_bool:
+            constraint_4_bool = False
+        if constraint_4_bool:
             if verbose:
-                logger.info('Assignment meets constraint #3: there is an optimal (minimal) number of pools (%i).' % num_pools)
+                logger.info('Assignment meets constraint #4: there is an optimal (minimal) number of pools (%i).' % num_pools)
 
-        return constraint_1_bool & constraint_2_bool & constraint_3_bool
+        return constraint_1_bool & constraint_2_bool & constraint_3_bool & constraint_4_bool
 
     def shuffle_pool_ids(self):
         """
@@ -552,7 +570,7 @@ def minimize_violations(
             verbose: bool = True
     ) -> List['BlockAssignment']:
         """
-        Minimizes violations (i.e. non-unique pool assignment) in a list of
+        Minimizes violations (i.e. number of times peptide pairs are pooled together more than once) in a list of
         block assignments by shuffling pool IDs.
 
         Parameters
@@ -564,16 +582,16 @@ def minimize_violations(
         -------
         block_assignments   :   List of BlockAssignment objects.
         """
-        min_violations = BlockAssignment.merge(block_assignments=block_assignments).count_violations()
+        min_violations = BlockAssignment.merge(block_assignments=block_assignments).num_violations()
         curr_block_assignments = copy.deepcopy(block_assignments)
         best_block_assignments = copy.deepcopy(block_assignments)
         for _ in range(0, shuffle_iters):
             random_idx = random.choice(list(range(0, len(curr_block_assignments))))
             curr_block_assignments[random_idx].shuffle_pool_ids()
-            curr_num_violations = BlockAssignment.merge(block_assignments=curr_block_assignments).count_violations()
+            curr_num_violations = BlockAssignment.merge(block_assignments=curr_block_assignments).num_violations()
             if curr_num_violations < min_violations:
                 if verbose:
-                    logger.info('Found a better assignment: current number of violations: %i, new number of violations: %i' %
+                    logger.info('\tFound a better assignment; current number of violations: %i, new number of violations: %i' %
                                 (min_violations, curr_num_violations))
                 best_block_assignments = copy.deepcopy(curr_block_assignments)
                 min_violations = curr_num_violations
diff --git a/src/acelib/block_design.py b/src/acelib/block_design.py
@@ -377,7 +377,7 @@ def divide_block_design(
                 peptides = block_design.peptides[start_peptide_idx:end_peptide_idx + 1]
                 start_peptide_idx = end_peptide_idx + 1
                 if verbose:
-                    logger.info('\t\tAppending block design for %i peptides, %i peptides per pool' %
+                    logger.info('\tAppending block design for %i peptides, %i peptides per pool' %
                                 (len(peptides), num_peptides_per_pool))
                 block_design_ = BlockDesign(
                     peptides=peptides,
diff --git a/src/acelib/cli/cli_generate.py b/src/acelib/cli/cli_generate.py
@@ -17,6 +17,7 @@
 """
 
 
+import argparse
 import math
 import pandas as pd
 import os
@@ -171,6 +172,15 @@ def add_ace_generate_arg_parser(sub_parsers):
         required=False,
         help="Initialization mode for golfy (default: %s)." % GENERATE_GOLFY_INIT_MODE
     )
+    parser_optional_golfy.add_argument(
+        "--golfy-allow-extra-pools",
+        dest="golfy_allow_extra_pools",
+        type=eval,
+        default=GENERATE_GOLFY_ALLOW_EXTRA_POOLS,
+        choices=[True, False],
+        required=False,
+        help="Allow extra pools for golfy (default: %r)." % GENERATE_GOLFY_ALLOW_EXTRA_POOLS
+    )
 
     parser_optional_sat_solver = parser.add_argument_group("optional arguments (applies when '--mode sat_solver')")
     parser_optional_sat_solver.add_argument(
@@ -254,6 +264,7 @@ def run_ace_generate_from_parsed_args(args):
                 random_seed
                 golfy_max_iters
                 golfy_init_mode
+                golfy_allow_extra_pools
                 num_processes
                 shuffle_iters
                 max_peptides_per_block
@@ -320,13 +331,15 @@ def run_ace_generate_from_parsed_args(args):
             random_seed=args.random_seed,
             max_iters=args.golfy_max_iters,
             init_mode=args.golfy_init_mode,
+            allow_extra_pools=args.golfy_allow_extra_pools,
             verbose=args.verbose
         )
     elif args.mode == GenerateModes.SAT_SOLVER:
         block_assignment = run_ace_sat_solver(
             block_design=block_design,
             max_peptides_per_pool=args.max_peptides_per_pool,
             num_processes=args.num_processes,
+            shuffle_iters=args.shuffle_iters,
             verbose=args.verbose
         )
     else:
diff --git a/src/acelib/default_parameters.py b/src/acelib/default_parameters.py
@@ -19,11 +19,12 @@
 """generate"""
 # Number of processes.
 GENERATE_NUM_PROCESSES = 4
-GENERATE_GOLFY_MAX_ITERS = 2000
 GENERATE_RANDOM_SEED = 42
+GENERATE_GOLFY_MAX_ITERS = 2000
 GENERATE_GOLFY_INIT_MODE = 'greedy'
+GENERATE_GOLFY_ALLOW_EXTRA_POOLS = True
 GENERATE_SEQUENCE_SIMILARITY_THRESHOLD = 0.7
 GENERATE_SEQUENCE_SIMILARITY_FUNCTION = 'euclidean'
-GENERATE_SHUFFLE_ITERS = 100
+GENERATE_SHUFFLE_ITERS = 1000
 GENERATE_MAX_PEPTIDES_PER_BLOCK = 100
 GENERATE_MAX_PEPTIDES_PER_POOL = 10
diff --git a/src/acelib/main.py b/src/acelib/main.py
@@ -42,6 +42,7 @@ def run_ace_golfy(
         random_seed: int = GENERATE_RANDOM_SEED,
         max_iters: int = GENERATE_GOLFY_MAX_ITERS,
         init_mode: str = GENERATE_GOLFY_INIT_MODE,
+        allow_extra_pools: bool = GENERATE_GOLFY_ALLOW_EXTRA_POOLS,
         verbose: bool = True
 ) -> BlockAssignment:
     """
@@ -53,6 +54,7 @@ def run_ace_golfy(
     random_seed         :   Random seed.
     max_iters           :   Number of maximum iterations for golfy.
     init_mode           :   Init mode.
+    allow_extra_pools   :   If True, allows extra pools (passed on to golfy).
     verbose             :   If True, prints messages.
 
     Returns
@@ -81,9 +83,15 @@ def run_ace_golfy(
         num_replicates=block_design.num_coverage,
         strategy=init_mode,
         preferred_neighbors=preferred_neighbors,
+        allow_extra_pools=allow_extra_pools,
+        verbose=verbose
+    )
+    optimize(
+        golfy_solution,
+        max_iters=max_iters,
+        allow_extra_pools=allow_extra_pools,
         verbose=verbose
     )
-    optimize(golfy_solution, max_iters=max_iters, verbose=verbose)
 
     if verbose:
         logger.info('Finished running golfy.')
@@ -190,16 +198,19 @@ def run_ace_sat_solver(
             block_assignments.append(block_assignment)
 
     # Step 4. Merge assignments
+    logger.info('Started minimizing violations.')
     block_assignments = BlockAssignment.minimize_violations(
         block_assignments=block_assignments,
         shuffle_iters=shuffle_iters,
         verbose=verbose
     )
+    logger.info('Finished minimizing violations.')
     block_assignment = BlockAssignment.merge(block_assignments=block_assignments)
 
     if verbose:
         logger.info('Finished running SAT solver.')
-        logger.info('The returning block assignment has %i pools in total.' % block_assignment.num_pools)
-        logger.info('The returning block assignment has %i peptides in total.' % len(block_assignment.peptide_ids))
+        logger.info('The returning block assignment has the following:')
+        logger.info('\t%i pools in total.' % block_assignment.num_pools)
+        logger.info('\t%i peptides in total.' % len(block_assignment.peptide_ids))
     return block_assignment
 
diff --git a/src/acelib/sequence_features.py b/src/acelib/sequence_features.py
@@ -123,7 +123,6 @@ def forward(self, inputs, representation='last_hidden_state'):
 
     def load_weights(self, weights_path):
         """Load weights from a file"""
-        logger.info(self.device)
         self.load_state_dict(torch.load(weights_path, map_location=self.device))
 
     def save_weights(self, weights_path):