# plots.py

import matplotlib.pyplot as plt
import pandas as pd
import re

# Benchmark names have the form "<system>_<model><length>_<alg><params>".
# The pattern is assembled from adjacent literals, one per component, which
# concatenate to a single pattern string.
benchmark_pattern = (
    r"(?P<system>(MonadBayes|Anglican|WebPPL))"
    r"_(?P<model>(LR|HMM|LDA))"
    r"(?P<length>[0-9]+)"
    r"_(?P<alg>("
    r"SMC(?P<smcparam>[0-9]+$)"
    r"|MH(?P<mhparam>[0-9]+$)"
    r"|RMSMC(?P<rmsmcparam>[0-9]+-[0-9]+$)"
    r"))"
)
# The RMSMC parameter is itself "<particles>-<steps>".
rmsmc_pattern = r"(?P<particles>[0-9]+)-(?P<steps>[0-9]+)"

benchmark_reg = re.compile(benchmark_pattern)
rmsmc_reg = re.compile(rmsmc_pattern)


def unpack_name(benchmark_name):
    """Parse a benchmark name into its components.

    Names have the form ``<system>_<model><length>_<alg><params>``, for
    example ``MonadBayes_LR50_SMC100`` or ``WebPPL_HMM20_RMSMC10-1``.

    Args:
        benchmark_name: Benchmark name to parse.

    Returns:
        A tuple ``(system, model, length, alg_name, particles, steps)``, or
        ``None`` when the name does not match ``benchmark_reg``. ``steps`` is
        0 for SMC and ``particles`` is 0 for MH.

    Raises:
        ValueError: If the matched algorithm is not SMC, MH or RMSMC.
    """
    m = benchmark_reg.match(benchmark_name)
    if m is None:
        return None

    # Named groups are read with Match.group(); the previous
    # m.expand("\g<name>") templates relied on an invalid string escape.
    system = m.group("system")
    model = m.group("model")
    length = int(m.group("length"))
    alg = m.group("alg")
    if alg.startswith("SMC"):
        alg_name = "SMC"
        particles = int(m.group("smcparam"))
        steps = 0
    elif alg.startswith("MH"):
        alg_name = "MH"
        particles = 0
        steps = int(m.group("mhparam"))
    elif alg.startswith("RMSMC"):
        alg_name = "RMSMC"
        # The RMSMC parameter is "<particles>-<steps>"; split it further.
        t = rmsmc_reg.match(m.group("rmsmcparam"))
        particles = int(t.group("particles"))
        steps = int(t.group("steps"))
    else:
        raise ValueError("Unrecognized algorithm: " + alg)
    return system, model, length, alg_name, particles, steps


def unpack_names(series):
    """Parse many benchmark names into a table of their components.

    Names that ``unpack_name`` does not recognise are skipped. Every
    remaining row keeps the label of the name it came from (the Series index
    label, or the position for any other iterable), so that columns copied
    over from the source frame afterwards line up with the right benchmark
    even when some names were skipped.

    Args:
        series: Iterable of benchmark names, typically a ``pandas.Series``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``system``, ``model``,
        ``length``, ``alg``, ``particles`` and ``steps``.
    """
    columns = ["system", "model", "length", "alg", "particles", "steps"]
    if isinstance(series, pd.Series):
        labelled = series.items()
    else:
        labelled = enumerate(series)

    index = []
    rows = []
    for label, name in labelled:
        parsed = unpack_name(name)
        if parsed is not None:
            index.append(label)
            rows.append(parsed)
    return pd.DataFrame(rows, columns=columns, index=index)


def style(system):
    """Return the matplotlib format string used to draw *system*.

    "MonadBayes" maps to "ro" and "Anglican" to "bs"; any other value
    falls back to "gX".
    """
    known_styles = (("MonadBayes", "ro"), ("Anglican", "bs"))
    for name, fmt in known_styles:
        if system == name:
            return fmt
    return "gX"


# Layout of the plot grids below: one marker series per system in every
# subplot, one column of subplots per algorithm, one row per model.
systems = ["MonadBayes", "Anglican", "WebPPL"]
algs = ["MH", "SMC", "RMSMC"]
models = ["LR", "HMM", "LDA"]


# plot execution time vs. dataset size

benchmarks = pd.read_csv("speed-length.csv")
results = unpack_names(benchmarks["Name"])
# Copy the measurements over next to the parsed name components.
results["time"] = benchmarks["Mean"]
results["timeLB"] = benchmarks["MeanLB"]
results["timeUB"] = benchmarks["MeanUB"]


# Algorithm parameters held fixed while the dataset size varies.
mhsteps = 100
smcsize = 100
rmsize, rmsteps = 10, 1

# One row of subplots per model, one column per algorithm.
fig, subplots = plt.subplots(nrows=len(models), ncols=len(algs), figsize=(12, 8))
lines = []
for i, model in enumerate(models):
    for j, alg in enumerate(algs):
        subplot = subplots[i, j]
        data = results.loc[(results["model"] == model) & (results["alg"] == alg)]
        # Keep only the runs that used the fixed parameters of this algorithm.
        if alg == "MH":
            data = data.loc[data["steps"] == mhsteps]
        elif alg == "SMC":
            data = data.loc[data["particles"] == smcsize]
        else:
            data = data.loc[(data["steps"] == rmsteps) & (data["particles"] == rmsize)]
        for system in systems:
            t = data.loc[data["system"] == system]
            xs = t["length"]
            ys = t["time"]
            if model == "LDA":
                # LDA has 5 documents
                xs = xs * 5
            (line,) = subplot.plot(xs, ys, style(system), label=system)
            lines.append((line, system))
        # Axis labels only on the bottom row and the left-most column.
        if i == len(models) - 1:
            subplot.set_xlabel("Dataset size")
        if j == 0:
            subplot.set_ylabel("Execution time [s]")

pad = 5
algnames = [
    "MH" + str(mhsteps),
    "SMC" + str(smcsize),
    "RMSMC" + str(rmsize) + "-" + str(rmsteps),
]
# Column titles: algorithm name with its fixed parameters, above the top row.
for ax, col in zip(subplots[0], algnames):
    ax.annotate(
        col,
        xy=(0.5, 1),
        xytext=(0, pad),
        xycoords="axes fraction",
        textcoords="offset points",
        size="large",
        ha="center",
        va="baseline",
    )

# Row titles: model name, placed to the left of the first column's y label.
for ax, row in zip(subplots[:, 0], models):
    ax.annotate(
        row,
        xy=(0, 0.5),
        xytext=(-ax.yaxis.labelpad - pad, 0),
        xycoords=ax.yaxis.label,
        textcoords="offset points",
        size="large",
        ha="right",
        va="center",
    )

# The first three lines belong to the first subplot, one per system; they
# serve as legend handles. The first system is labelled "Ours".
handles, labels = zip(*lines[:3])
labels = ("Ours", labels[1], labels[2])
plt.figlegend(handles, labels, loc="upper right")
plt.savefig("length.pdf")


# plot execution time vs. # samples

benchmarks = pd.read_csv("speed-samples.csv")
results = unpack_names(benchmarks["Name"])
# Copy the measurements over next to the parsed name components.
results["time"] = benchmarks["Mean"]
results["timeLB"] = benchmarks["MeanLB"]
results["timeUB"] = benchmarks["MeanUB"]

# Dataset length held fixed per model, and the fixed RMSMC particle count,
# while the number of samples varies.
lrlength = 50
hmmlength = 20
ldalength = 10
rmparticles = 10
fixed_lengths = {"LR": lrlength, "HMM": hmmlength}

# One row of subplots per model, one column per algorithm.
fig, subplots = plt.subplots(nrows=len(models), ncols=len(algs), figsize=(12, 8))
lines = []
for i, model in enumerate(models):
    for j, alg in enumerate(algs):
        subplot = subplots[i, j]
        data = results.loc[(results["model"] == model) & (results["alg"] == alg)]
        # Any model other than LR and HMM uses the LDA length.
        data = data.loc[data["length"] == fixed_lengths.get(model, ldalength)]
        # Choose the x-axis quantity once per subplot.
        if alg == "MH":
            xcolumn, xlabel = "steps", "Number of steps"
        elif alg == "SMC":
            xcolumn, xlabel = "particles", "Number of particles"
        else:
            # RMSMC: fixed particle count, varying rejuvenation steps.
            data = data.loc[data["particles"] == rmparticles]
            xcolumn, xlabel = "steps", "Number of rejuvenation steps"
        for system in systems:
            t = data.loc[data["system"] == system]
            xs = t[xcolumn]
            ys = t["time"]
            (line,) = subplot.plot(xs, ys, style(system), label=system)
            lines.append((line, system))
        # Axis labels only on the bottom row and the left-most column.
        if i == len(models) - 1:
            subplot.set_xlabel(xlabel)
        if j == 0:
            subplot.set_ylabel("Execution time [s]")

pad = 5
# The RMSMC column title carries the particle count used as the filter above.
algnames = ["MH", "SMC", "RMSMC" + str(rmparticles)]
for ax, col in zip(subplots[0], algnames):
    ax.annotate(
        col,
        xy=(0.5, 1),
        xytext=(0, pad),
        xycoords="axes fraction",
        textcoords="offset points",
        size="large",
        ha="center",
        va="baseline",
    )

# Row titles: model name followed by its fixed length (times 5 for LDA).
modelnames = ["LR" + str(lrlength), "HMM" + str(hmmlength), "LDA" + str(ldalength * 5)]
for ax, row in zip(subplots[:, 0], modelnames):
    ax.annotate(
        row,
        xy=(0, 0.5),
        xytext=(-ax.yaxis.labelpad - pad, 0),
        xycoords=ax.yaxis.label,
        textcoords="offset points",
        size="large",
        ha="right",
        va="center",
    )

# The first three lines belong to the first subplot, one per system; they
# serve as legend handles. The first system is labelled "Ours".
handles, labels = zip(*lines[:3])
labels = ("Ours", labels[1], labels[2])
plt.figlegend(handles, labels, loc="upper right")
plt.savefig("samples.pdf")