diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a7f0721851686686ec8819606c4d14297ff61bc4..81372257bfb43a079f6c18d4889e850871f7fabd 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -26,7 +26,7 @@ tests:
         - apt install -y libgl1-mesa-glx xvfb
         - pip install tox
         - apt install -y graphviz xdg-utils
-        - xvfb-run -s "-screen 0 800x600x24" tox
+        - xvfb-run tox -v --recreate
 
 build_and_deploy_docs:
     image: "python:latest"
@@ -42,7 +42,7 @@ build_and_deploy_docs:
     script:
         - pip install -r requirements_dev.txt
         - python setup.py install
-        - make docs
+        - xvfb-run make docs
         - aws s3 cp ./docs/_build/html/ s3://${BUCKET_NAME} --recursive
     environment:
         name: ${CI_COMMIT_REF_SLUG}
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 7ae26bcc3d4e0f4a5cbbcfafd055e2c084a42345..65971d323bb11cd7449be54f22898bedfa6e4b0d 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -79,7 +79,7 @@ Ready to contribute? Here's how to set up `flatland` for local development.
 5. When you're done making changes, check that your changes pass flake8 and the
    tests, including testing other Python versions with tox::
 
-    $ flake8 flatland tests examples
+    $ flake8 flatland tests examples benchmarks
     $ python setup.py test or py.test
     $ tox
 
diff --git a/Makefile b/Makefile
index 69ad1b42fd51ef9ec9420f5473dc8acef5468572..98dcbb47a03ad7125694e5053f5e973e45b4fba4 100644
--- a/Makefile
+++ b/Makefile
@@ -51,7 +51,7 @@ clean-test: ## remove test and coverage artifacts
 	rm -fr .pytest_cache
 
 lint: ## check style with flake8
-	flake8 flatland tests examples
+	flake8 flatland tests examples benchmarks
 
 test: ## run tests quickly with the default Python
 	echo "$$DISPLAY"
@@ -61,7 +61,7 @@ test-all: ## run tests on every Python version with tox
 	tox
 
 coverage: ## check code coverage quickly with the default Python
-	xvfb-run -a coverage run --source flatland -m pytest
+	coverage run --source flatland -m pytest
 	coverage report -m
 	coverage html
 	$(BROWSER) htmlcov/index.html
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/benchmarks/play_model_benchmark.py b/benchmarks/play_model_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3c087e8ab8ed33a24bd447cd3203b85eb1a28f2
--- /dev/null
+++ b/benchmarks/play_model_benchmark.py
@@ -0,0 +1,135 @@
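+"""Benchmark a random-action episode loop on flatland's RailEnv."""
+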
+import random
+import time
+from collections import deque
+
+import numpy as np
+from benchmarker import Benchmarker
+
+from flatland.envs.generators import complex_rail_generator
+from flatland.envs.rail_env import RailEnv
+from flatland.utils.rendertools import RenderTool
+
+
+def main(render=True, delay=0.0):
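+    """Run n_trials episodes of uniformly random actions on a generated
+    rail env; rendering and a per-step delay are optional so the same
+    loop can double as a headless benchmark body."""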
+    random.seed(1)
+    np.random.seed(1)
+
+    # Example: generate a random rail
+    env = RailEnv(width=15, height=15,
+                  rail_generator=complex_rail_generator(nr_start_goal=5, nr_extra=20, min_dist=12),
+                  number_of_agents=5)
+
+    if render:
+        env_renderer = RenderTool(env, gl="QTSVG")
+
+    n_trials = 20
+    eps = 1.
+    eps_end = 0.005
+    eps_decay = 0.998
+    action_dict = dict()
+    scores_window = deque(maxlen=100)
+    done_window = deque(maxlen=100)
+    scores = []
+    dones_list = []
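+    # Histogram of how often each of the 4 discrete actions is sampled (for reporting)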
+    action_prob = [0] * 4
+
+    def max_lt(seq, val):
+        """
+        Return the greatest item in seq that satisfies 0 <= item < val.
+        None is returned if seq is empty or no item qualifies.
+        """
+        largest = None
+        for item in seq:
+            if 0 <= item < val and (largest is None or item > largest):
+                largest = item
+        return largest
+
+    iFrame = 0
+    tStart = time.time()
+    for trials in range(1, n_trials + 1):
+
+        # Reset environment
+        obs = env.reset()
+        if render:
+            env_renderer.set_new_rail()
+
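+        # Scale each agent's observation into [-1, 1] using its largest finite entry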
+        for a in range(env.get_num_agents()):
+            norm = max(1, max_lt(obs[a], np.inf))
+            obs[a] = np.clip(np.array(obs[a]) / norm, -1, 1)
+
+        # env.obs_builder.util_print_obs_subtree(tree=obs[0], num_elements_per_node=5)
+
+        score = 0
+        env_done = 0
+
+        # Run episode
+        for step in range(100):
+            # Action
+            for a in range(env.get_num_agents()):
+                action = np.random.randint(0, 4)
+                action_prob[action] += 1
+                action_dict.update({a: action})
+
+            if render:
+                env_renderer.renderEnv(show=True, frames=True, iEpisode=trials, iStep=step, action_dict=action_dict)
+                if delay > 0:
+                    time.sleep(delay)
+
+            iFrame += 1
+
+            # Environment step
+            next_obs, all_rewards, done, _ = env.step(action_dict)
+            for a in range(env.get_num_agents()):
+                norm = max(1, max_lt(next_obs[a], np.inf))
+                next_obs[a] = np.clip(np.array(next_obs[a]) / norm, -1, 1)
+            # Update replay buffer and train agent
+            for a in range(env.get_num_agents()):
+                # agent.step(obs[a], action_dict[a], all_rewards[a], next_obs[a], done[a])
+                score += all_rewards[a]
+
+            obs = next_obs.copy()
+            if done['__all__']:
+                env_done = 1
+                break
+        # Epsilon decay
+        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
+
+        done_window.append(env_done)
+        scores_window.append(score)  # save most recent score
+        scores.append(np.mean(scores_window))
+        dones_list.append(np.mean(done_window))
+
+        print(('\rTraining {} Agents.\tEpisode {}\tAverage Score: {:.0f}\tDones: {:.2f}%' +
+               '\tEpsilon: {:.2f} \t Action Probabilities: \t {}').format(
+            env.get_num_agents(),
+            trials,
+            np.mean(scores_window),
+            100 * np.mean(done_window),
+            eps, action_prob / np.sum(action_prob)),
+            end=" ")
+        if trials % 100 == 0:
+            tNow = time.time()
+            rFps = iFrame / (tNow - tStart)
+            print(('\rTraining {} Agents.\tEpisode {}\tAverage Score: {:.0f}\tDones: {:.2f}%' +
+                   '\tEpsilon: {:.2f} fps: {:.2f} \t Action Probabilities: \t {}').format(
+                env.get_num_agents(),
+                trials,
+                np.mean(scores_window),
+                100 * np.mean(done_window),
+                eps, rFps, action_prob / np.sum(action_prob)))
+            action_prob = [0] * 4  # reset the action histogram
+
+
+if __name__ == "__main__":
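+    # Per benchmarker's docs, cycle=20 repeats the benchmark 20 times and
+    # extra=1 adds extra runs whose min/max results are excluded from the report.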
+    with Benchmarker(cycle=20, extra=1) as bench:
+        @bench("Everything")
+        def _(bm):
+            main(render=False, delay=0)
diff --git a/requirements_dev.txt b/requirements_dev.txt
index 4b288cee88f4231f330c51f70b1cd9c9f9d05389..cb11e71ae9f86dc4021665dc37172e1d66fda20b 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -7,6 +7,7 @@ tox==3.5.2
 coverage==4.5.1
 Sphinx==1.8.1
 twine==1.12.1
+benchmarker==4.0.1
 
 pytest==3.8.2
 pytest-runner==4.2
diff --git a/tox.ini b/tox.ini
index 6dd011aadeb2e7ba802ff692278aa763fb665f10..939334a6315c5758ca3d06d967fc7fcbbbbfaa21 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py36, py37, flake8, docs, coverage, xvfb-run, sh
+envlist = py36, py37, flake8, docs, coverage, benchmark, sh
 
 [travis]
 python =
@@ -13,27 +13,43 @@ ignore = E121 E126 E123 E128 E133 E226 E241 E242 E704 W291 W293 W391 W503 W504 W
 [testenv:flake8]
 basepython = python
 deps = flake8
-commands = flake8 flatland tests examples
+commands = flake8 flatland tests examples benchmarks
 
 [testenv:docs]
 basepython = python
 whitelist_externals = make
+passenv =
+    DISPLAY
 commands = make docs
 
 [testenv:coverage]
 basepython = python
 whitelist_externals = make
+passenv =
+    DISPLAY
 commands =
     pip install -U pip
     pip install -r requirements_dev.txt
     make coverage
 
+[testenv:benchmark]
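+# Run each script under benchmarks/ as its own Python process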
+basepython = python
+setenv =
+    PYTHONPATH = {toxinidir}
+passenv =
+    DISPLAY
+whitelist_externals = sh
+commands =
+    sh -c 'ls benchmarks/*.py | xargs -n 1 python'
+
 [testenv]
-whitelist_externals = xvfb-run
-                      sh
+whitelist_externals = sh
                       pip
 setenv =
     PYTHONPATH = {toxinidir}
+passenv =
+    DISPLAY
 deps =
     -r{toxinidir}/requirements_dev.txt
 ; If you want to make tox run the tests with the same versions, create a
@@ -43,6 +58,6 @@ commands =
     pip install -U pip
     pip install -r requirements_dev.txt
     sh -c 'echo DISPLAY: $DISPLAY'
-    xvfb-run -a py.test --basetemp={envtmpdir}
+    py.test --basetemp={envtmpdir}