diff --git a/.gitignore b/.gitignore
index 8d35cb3..1d42d5e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,70 @@
-__pycache__
-*.pyc
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+*.c
+
+# logs
+runs/
+checkpoints/
+
+#other
+.DS_Store
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+#Ipython Notebook
+.ipynb_checkpoints
diff --git a/README.md b/README.md
index 2c55182..f6adccb 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,12 @@ Install most recent nightly build (version '0.1.10+2fd4d08' or later) of PyTorch
 pip install git+https://github.com/pytorch/pytorch
 `
 
+## Dependencies
+ * pytorch
+ * torchvision
+ * universe (for now)
+ * [tensorboard logger](https://github.com/TeamHG-Memex/tensorboard_logger)
+
 ## Results
 
 With 16 processes it converges for PongDeterministic-v3 in 15 minutes.
diff --git a/main.py b/main.py
index 2b29960..c3b44b9 100644
--- a/main.py
+++ b/main.py
@@ -4,18 +4,23 @@
 import os
 import sys
 import math
+import time
 
 import torch
 import torch.optim as optim
 import torch.multiprocessing as mp
 import torch.nn as nn
 import torch.nn.functional as F
+import tensorboard_logger as tb
+
+import my_optim
 from envs import create_atari_env
 from model import ActorCritic
 from train import train
 from test import test
 from utils import logger
-import my_optim
+from utils.shared_memory import SharedCounter
+
 
 logger = logger.getLogger('main')
 
@@ -41,16 +46,27 @@
                     help='environment to train on (default: PongDeterministic-v3)')
 parser.add_argument('--no-shared', default=False, metavar='O',
                     help='use an optimizer without shared momentum.')
-parser.add_argument('--max-iters', type=int, default=math.inf,
-                    help='maximum iterations per process.')
-
+parser.add_argument('--max-episode-count', type=int, default=math.inf,
+                    help='maximum number of episodes to run per process.')
 parser.add_argument('--debug', action='store_true', default=False,
                     help='run in a way its easier to debug')
+parser.add_argument('--short-description', default='no_descr',
+                    help='Short description of the run params, (used in tensorboard)')
+
+def setup_loggings(args):
+    logger.debug('CONFIGURATION: {}'.format(args))
+
+    cur_path = os.path.dirname(os.path.realpath(__file__))
+    args.summ_base_dir = (cur_path+'/runs/{}/{}({})').format(args.env_name,
+        time.strftime('%d.%m-%H.%M'), args.short_description)
+    logger.info('logging run logs to {}'.format(args.summ_base_dir))
+    tb.configure(args.summ_base_dir)
 
 if __name__ == '__main__':
     args = parser.parse_args()
-
+    setup_loggings(args)
     torch.manual_seed(args.seed)
+
     env = create_atari_env(args.env_name)
     shared_model = ActorCritic(
         env.observation_space.shape[0], env.action_space)
@@ -61,20 +77,23 @@
     else:
         optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
         optimizer.share_memory()
-
+
+    gl_step_cnt = SharedCounter()
     if not args.debug:
         processes = []
 
-        p = mp.Process(target=test, args=(args.num_processes, args, shared_model))
+        p = mp.Process(target=test, args=(args.num_processes, args,
+                                          shared_model, gl_step_cnt))
         p.start()
         processes.append(p)
 
         for rank in range(0, args.num_processes):
-            p = mp.Process(target=train, args=(rank, args, shared_model, optimizer))
+            p = mp.Process(target=train, args=(rank, args, shared_model,
+                                               gl_step_cnt, optimizer))
             p.start()
             processes.append(p)
         for p in processes:
             p.join()
     else: ## debug is enabled
         # run only one process in a main, easier to debug
-        train(0, args, shared_model, optimizer)
+        train(0, args, shared_model, gl_step_cnt, optimizer)
diff --git a/model.py b/model.py
index 5328d47..89cf377 100644
--- a/model.py
+++ b/model.py
@@ -44,12 +44,8 @@ def __init__(self, num_inputs, action_space):
         self.lstm = nn.LSTMCell(32 * 3 * 3, 256)
 
         num_outputs = action_space.n
-
         self.critic_linear = nn.Linear(256, 1)
         self.actor_linear = nn.Linear(256, num_outputs)
-        #self.critic_linear = nn.Linear(288, 1)
-        #self.actor_linear = nn.Linear(288, num_outputs)
-
         self.apply(weights_init)
         self.actor_linear.weight.data = normalized_columns_initializer(
             self.actor_linear.weight.data, 0.01)
diff --git a/test.py b/test.py
index d08be60..103f270 100644
--- a/test.py
+++ b/test.py
@@ -1,21 +1,23 @@
 import math
 import os
 import sys
+import time
 
 import torch
 import torch.nn.functional as F
 import torch.optim as optim
+import tensorboard_logger as tb
+
 from envs import create_atari_env
 from model import ActorCritic
 from torch.autograd import Variable
 from torchvision import datasets, transforms
-import time
 from collections import deque
 from utils import logger
 
 logger = logger.getLogger('test')
 
-def test(rank, args, shared_model):
+def test(rank, args, shared_model, gl_step_cnt):
     torch.manual_seed(args.seed + rank)
 
     env = create_atari_env(args.env_name)
@@ -32,6 +34,8 @@ def test(rank, args, shared_model):
 
     start_time = time.time()
 
+    local_episode_num = 0
+
     # a quick hack to prevent the agent from stucking
     actions = deque(maxlen=100)
     episode_length = 0
@@ -61,10 +65,17 @@ def test(rank, args, shared_model):
             done = True
 
         if done:
+            passed_time = time.time() - start_time
+            local_episode_num += 1
+            global_step_count = gl_step_cnt.get_value()
+
             logger.info("Time {}, episode reward {}, episode length {}".format(
                 time.strftime("%Hh %Mm %Ss",
-                              time.gmtime(time.time() - start_time)),
+                              time.gmtime(passed_time)),
                 reward_sum, episode_length))
+            tb.log_value('steps_second', global_step_count / passed_time, global_step_count)
+            tb.log_value('reward', reward_sum, global_step_count)
+
             reward_sum = 0
             episode_length = 0
             actions.clear()
diff --git a/train.py b/train.py
index 0c2e864..b0f90ff 100644
--- a/train.py
+++ b/train.py
@@ -1,12 +1,11 @@
 import math
 import os
 import sys
-import resource
-import gc
 
 import torch
 import torch.nn.functional as F
 import torch.optim as optim
+
 from envs import create_atari_env
 from model import ActorCritic
 from torch.autograd import Variable
@@ -21,7 +20,7 @@ def ensure_shared_grads(model, shared_model):
             return
         shared_param._grad = param.grad
 
-def train(rank, args, shared_model, optimizer=None):
+def train(rank, args, shared_model, gl_step_count, optimizer=None):
     torch.manual_seed(args.seed + rank)
 
     env = create_atari_env(args.env_name)
@@ -39,8 +38,7 @@ def train(rank, args, shared_model, optimizer=None):
     done = True
 
     episode_length = 0
-
-    iteration = 0
+    episode_count = 0
 
 
     while True:
@@ -49,17 +47,11 @@ def train(rank, args, shared_model, optimizer=None):
         rewards = []
         entropies = []
 
-        if iteration == args.max_iters:
-            logger.info('Max iteration {} reached..'.format(args.max_iters))
+        if episode_count == args.max_episode_count:
+            logger.info('Maximum episode count {} reached.'.format(args.max_episode_count))
+            # TODO: make sure test.py also exits when no train process is running
             break
 
-        if iteration % 200 == 0 and rank == 0:
-            mem_used = int(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
-            mem_used_mb = mem_used / 1024
-            logger.info('Memory usage of one proc: {} (mb)'.format(mem_used_mb))
-
-
-        iteration += 1
         episode_length += 1
 
         # Sync with the shared model
@@ -89,6 +81,7 @@
 
             if done:
                 episode_length = 0
+                episode_count += 1
                 state = env.reset()
 
             state = torch.from_numpy(state)
@@ -99,6 +92,9 @@
             if done:
                 break
 
+        # increment global step count
+        gl_step_count.increment_by(step)
+
         R = torch.zeros(1, 1)
         if not done:
             value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
diff --git a/utils/shared_memory.py b/utils/shared_memory.py
new file mode 100644
index 0000000..221a70a
--- /dev/null
+++ b/utils/shared_memory.py
@@ -0,0 +1,13 @@
+from multiprocessing import Value, Lock
+
+class SharedCounter:
+    def __init__(self):
+        self.lock = Lock()
+        self.n = Value('i', 0)
+
+    def increment_by(self, k):
+        with self.lock:
+            self.n.value += k
+
+    def get_value(self):
+        return self.n.value
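As an aside, here is a minimal, self-contained sketch of the sharing pattern this patch relies on: a single SharedCounter (as added in utils/shared_memory.py) is created in the parent and handed to each worker process, the same way main.py passes gl_step_cnt into train() and test(). The worker function and the step numbers below are illustrative only and do not appear in the patch:

from multiprocessing import Process, Value, Lock


class SharedCounter:
    # Same shape as the class added in utils/shared_memory.py:
    # a lock-guarded integer living in shared memory.
    def __init__(self):
        self.lock = Lock()
        self.n = Value('i', 0)

    def increment_by(self, k):
        with self.lock:
            self.n.value += k

    def get_value(self):
        return self.n.value


def worker(counter, steps):
    # Stand-in for a train() process reporting its local step count,
    # analogous to gl_step_count.increment_by(step) in train.py.
    counter.increment_by(steps)


if __name__ == '__main__':
    counter = SharedCounter()
    procs = [Process(target=worker, args=(counter, 20)) for _ in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    # All increments from the children are visible in the parent: prints 80.
    print(counter.get_value())

Note that get_value() reads the shared value without taking the lock; that is acceptable for the monitoring use in test.py, but the reader may see a slightly stale count while other processes are incrementing.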