diff --git a/montblanc/examples/MS_single_gpu_gradient_example.py b/montblanc/examples/MS_single_gpu_gradient_example.py
new file mode 100644
index 000000000..fe3a61e4d
--- /dev/null
+++ b/montblanc/examples/MS_single_gpu_gradient_example.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2016 Marzia Rivi
+#
+# This file is part of montblanc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+# Compare analytical chi squared gradient computation with respect to 
+# Sersic parameters with the numerical gradent computation. 
+ 
+# Input: filename - Measurement Set file
+#        ns - number of sersic sources
+
+import sys
+import argparse
+import math
+import numpy as np
+import montblanc
+import montblanc.util as mbu
+
+from  montblanc.config import RimeSolverConfig as Options
+
+ARCS2RAD = np.pi/648000.
+
+#scalelength in arcsec
+minscale = 0.3*ARCS2RAD
+maxscale = 3.5*ARCS2RAD
+
+parser = argparse.ArgumentParser(description='TEST SERSIC GRADIENT')
+parser.add_argument('msfile', help='Input MS filename')
+parser.add_argument('-ns',dest='nssrc', type=int, default=1, help='Number of Sersic Galaxies')
+
+args = parser.parse_args(sys.argv[1:])
+
+# Get the RIME solver
+# Enable gradient computation: sersic_gradient=True
+slvr_cfg = montblanc.rime_solver_cfg(msfile=args.msfile,
+        sources=montblanc.sources(point=0, gaussian=0, sersic=args.nssrc),
+        init_weights=None, weight_vector=False,  
+        sersic_gradient=True, dtype='double', version='v4')
+
+with montblanc.rime_solver(slvr_cfg) as slvr:
+
+    nsrc, nssrc, ntime, nchan = slvr.dim_local_size('nsrc', 'nssrc', 'ntime', 'nchan')
+    np.random.seed(14352)
+    
+    # Random source coordinates in the l,m (brightness image) domain
+    l= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD)
+    m= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD) 
+    lm=mbu.shape_list([l,m], shape=slvr.lm.shape, dtype=slvr.lm.dtype)
+    slvr.transfer_lm(lm)
+
+    # Brightness matrix for sources
+    stokes = np.empty(shape=slvr.stokes.shape, dtype=slvr.stokes.dtype)
+    I, Q, U, V = stokes[:,:,0], stokes[:,:,1], stokes[:,:,2], stokes[:,:,3]
+    I[:] = np.ones(shape=I.shape)*70.
+    Q[:] = np.zeros(shape=Q.shape)
+    U[:] = np.zeros(shape=U.shape)
+    V[:] = np.zeros(shape=V.shape)
+    slvr.transfer_stokes(stokes)
+    alpha = slvr.ft(np.ones(nssrc*ntime)*(-0.7)).reshape(nsrc,ntime)
+    slvr.transfer_alpha(alpha)
+    
+    # If there are sersic sources, create their
+    # shape matrix and transfer it.
+    mod = slvr.ft(np.random.random(nssrc))*0.3
+    angle = slvr.ft(np.random.random(nssrc))*2*np.pi
+    e1 = mod*np.sin(angle)
+    e2 = mod*np.cos(angle)
+    R = slvr.ft(np.random.random(nssrc)*(maxscale-minscale))+minscale
+    sersic_shape = slvr.ft(np.array([e1,e2,R])).reshape((3,nssrc))
+    slvr.transfer_sersic_shape(sersic_shape)
+    print sersic_shape
+
+    #Set visibility noise variance (muJy)
+    time_acc = 60
+    efficiency = 0.9
+    channel_bandwidth_hz = 20e6
+    SEFD = 400e6
+    sigma = (SEFD*SEFD)/(2.*time_acc*channel_bandwidth_hz*efficiency*efficiency)
+    slvr.set_sigma_sqrd(sigma)
+
+    # Create observed data and upload it to the GPU
+    slvr.solve()
+    with slvr.context:
+        observed = slvr.retrieve_model_vis()
+
+    noiseR = np.random.normal(0,np.sqrt(sigma),size=observed.shape)
+    noiseI = np.random.normal(0,np.sqrt(sigma),size=observed.shape)
+    noise = noiseR +1j*noiseI
+    observed = observed+noise
+    slvr.transfer_observed_vis(observed)
+ 
+    slvr.solve() 
+
+    print slvr
+    print "analytical: ",slvr.X2_grad
+
+    # Execute the pipeline
+    l1 = slvr.X2
+    dq = np.empty([3,nssrc])
+    for i in xrange(nssrc):
+        e1_inc = sersic_shape.copy()
+        e1_inc[0,i] = e1_inc[0,i]+0.000001
+        slvr.transfer_sersic_shape(e1_inc)
+        slvr.solve()
+        l2 = slvr.X2
+        dq[0,i] = (l2-l1)*1e6
+        e2_inc = sersic_shape.copy()
+        e2_inc[1,i] = e2_inc[1,i]+ 0.000001
+        slvr.transfer_sersic_shape(e2_inc)
+        slvr.solve()
+        l2 = slvr.X2
+        dq[1,i] = (l2-l1)*1e6
+        R_inc = sersic_shape.copy()
+        R_inc[2,i] = R_inc[2,i]+0.00000001
+        slvr.transfer_sersic_shape(R_inc)
+        slvr.solve()
+        l2 = slvr.X2
+        dq[2,i] = (l2-l1)*1e8
+    print "numeric: ",dq
diff --git a/montblanc/examples/MS_single_node_gradient_example.py b/montblanc/examples/MS_single_node_gradient_example.py
new file mode 100644
index 000000000..41637fa76
--- /dev/null
+++ b/montblanc/examples/MS_single_node_gradient_example.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2016 Marzia Rivi
+#
+# This file is part of montblanc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+# Compare analytical chi squared gradient computation with respect to 
+# Sersic parameters with the numerical gradent computation. 
+ 
+# Input: filename - Measurement Set file
+#        ns - number of sersic sources
+
+import sys
+import argparse
+import math
+import numpy as np
+import montblanc
+import montblanc.util as mbu
+
+from  montblanc.config import RimeSolverConfig as Options
+
+ARCS2RAD = np.pi/648000.
+
+#scalelength in arcsec
+minscale = 0.3*ARCS2RAD
+maxscale = 3.5*ARCS2RAD
+
+parser = argparse.ArgumentParser(description='TEST SERSIC GRADIENT')
+parser.add_argument('msfile', help='Input MS filename')
+parser.add_argument('-ns',dest='nssrc', type=int, default=1, help='Number of Sersic Galaxies')
+
+args = parser.parse_args(sys.argv[1:])
+
+# Get the RIME solver
+# Enable gradient computation: sersic_gradient=True
+slvr_cfg = montblanc.rime_solver_cfg(msfile=args.msfile,
+        sources=montblanc.sources(point=0, gaussian=0, sersic=args.nssrc),
+        init_weights=None, weight_vector=False,  
+        sersic_gradient=True, dtype='double', version='v5')
+
+with montblanc.rime_solver(slvr_cfg) as slvr:
+
+    nsrc, nssrc, ntime, nchan = slvr.dim_local_size('nsrc', 'nssrc', 'ntime', 'nchan')
+    np.random.seed(14352)    
+
+    # Random source coordinates in the l,m (brightness image) domain
+    l= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD)
+    m= slvr.ft(np.random.random(nsrc)*100*ARCS2RAD) 
+    slvr.lm[:] = mbu.shape_list([l,m], shape=slvr.lm.shape, dtype=slvr.lm.dtype)
+
+    # Brightness matrix for sources
+    I, Q, U, V = slvr.stokes[:,:,0], slvr.stokes[:,:,1], slvr.stokes[:,:,2], slvr.stokes[:,:,3]
+    I[:] = np.ones(shape=I.shape)*70.
+    Q[:] = np.zeros(shape=Q.shape)
+    U[:] = np.zeros(shape=U.shape)
+    V[:] = np.zeros(shape=V.shape)
+    slvr.alpha[:] = slvr.ft(np.ones(nssrc*ntime)*(-0.7)).reshape(nsrc,ntime)
+    
+    # If there are sersic sources, create their
+    # shape matrix and transfer it.
+    mod = slvr.ft(np.random.random(nssrc))*0.3
+    angle = slvr.ft(np.random.random(nssrc))*2*np.pi
+    e1 = mod*np.sin(angle)
+    e2 = mod*np.cos(angle)
+    R = slvr.ft(np.random.random(nssrc)*(maxscale-minscale))+minscale
+    slvr.sersic_shape[:] = slvr.ft(np.array([e1,e2,R])).reshape((3,nssrc))
+    print slvr.sersic_shape
+
+    #Set visibility noise variance (muJy)
+    time_acc = 60
+    efficiency = 0.9
+    channel_bandwidth_hz = 20e6
+    SEFD = 400e6
+    sigma = (SEFD*SEFD)/(2.*time_acc*channel_bandwidth_hz*efficiency*efficiency)
+    slvr.set_sigma_sqrd(sigma)
+
+    # Create observed data and upload it to the GPU
+    slvr.solve()
+    observed = slvr.model_vis[:].copy() 
+
+    noiseR = np.random.normal(0,np.sqrt(sigma),size=observed.shape)
+    noiseI = np.random.normal(0,np.sqrt(sigma),size=observed.shape)
+    noise = noiseR +1j*noiseI
+    slvr.observed_vis[:] = (observed+noise).copy()
+ 
+    slvr.solve()
+
+    print slvr 
+    print "analytical: ",slvr.X2_grad
+
+    # Execute the pipeline
+    l1 = slvr.X2
+    sersic_shape = slvr.sersic_shape.copy()
+    dq = np.empty([3,nssrc])
+    for i in xrange(nssrc):
+        e1_inc = sersic_shape.copy()
+        e1_inc[0,i] = e1_inc[0,i]+0.000001
+        slvr.sersic_shape[:] = e1_inc
+        slvr.solve()
+        l2 = slvr.X2
+        dq[0,i] = (l2-l1)*1e6
+        e2_inc = sersic_shape.copy()
+        e2_inc[1,i] = e2_inc[1,i]+ 0.000001
+        slvr.sersic_shape[:] = e2_inc
+        slvr.solve()
+        l2 = slvr.X2
+        dq[1,i] = (l2-l1)*1e6
+        R_inc = sersic_shape.copy()
+        R_inc[2,i] = R_inc[2,i]+0.00000001
+        slvr.sersic_shape[:] = R_inc
+        slvr.solve()
+        l2 = slvr.X2
+        dq[2,i] = (l2-l1)*1e8
+    print "numeric: ",dq
+
+
diff --git a/montblanc/factory.py b/montblanc/factory.py
index f9d3f4ad9..3be22fd04 100644
--- a/montblanc/factory.py
+++ b/montblanc/factory.py
@@ -137,12 +137,13 @@ def create_rime_solver_from_ms(slvr_class_type, slvr_cfg):
     autocor = slvr_cfg.get(Options.AUTO_CORRELATIONS)
 
     with MeasurementSetLoader(msfile, auto_correlations=autocor) as loader:
-        ntime, nbl, na, nbands, nchan = loader.get_dims()
+        ntime, nbl, na, nbands, nchan, npol = loader.get_dims()
         slvr_cfg[Options.NTIME] = ntime
         slvr_cfg[Options.NA] = na
         slvr_cfg[Options.NBL] = nbl
         slvr_cfg[Options.NBANDS] = nbands
         slvr_cfg[Options.NCHAN] = nchan
+        slvr_cfg[Options.NPOL] = npol
         slvr = slvr_class_type(slvr_cfg)
         loader.load(slvr, slvr_cfg)
         return slvr
@@ -167,7 +168,8 @@ def rime_solver(slvr_cfg):
     elif version == Options.VERSION_FIVE:
         from montblanc.impl.rime.v5.CompositeRimeSolver \
         import CompositeRimeSolver as RimeSolver
-        slvr_cfg[Options.CONTEXT] = __contexts
+        if slvr_cfg.get(Options.CONTEXT, None) is None: 
+            slvr_cfg[Options.CONTEXT] = __contexts
     else:
         raise ValueError('Invalid version %s' % version)
 
diff --git a/montblanc/impl/common/loaders/loaders.py b/montblanc/impl/common/loaders/loaders.py
index b6f31d9cd..3ac524989 100644
--- a/montblanc/impl/common/loaders/loaders.py
+++ b/montblanc/impl/common/loaders/loaders.py
@@ -31,6 +31,7 @@
 ANTENNA_TABLE = 'ANTENNA'
 SPECTRAL_WINDOW = 'SPECTRAL_WINDOW'
 FIELD_TABLE = 'FIELD'
+POL_TABLE = 'POLARIZATION'
 
 class MeasurementSetLoader(BaseLoader):
     LOG_PREFIX = 'LOADER:'
@@ -43,6 +44,7 @@ def __init__(self, msfile, auto_correlations=False):
         self.antfile = '::'.join((self.msfile, ANTENNA_TABLE))
         self.freqfile = '::'.join((self.msfile, SPECTRAL_WINDOW))
         self.fieldfile = '::'.join((self.msfile, FIELD_TABLE))
+        self.polfile = '::'.join((self.msfile, POL_TABLE))
 
         montblanc.log.info("{lp} Opening Measurement Set {ms}.".format(
             lp=self.LOG_PREFIX, ms=self.msfile))
@@ -58,6 +60,7 @@ def __init__(self, msfile, auto_correlations=False):
         # (2) baseline (ANTENNA1, ANTENNA2)
         # (3) band (SPECTRAL_WINDOW_ID via DATA_DESC_ID)
         ordering_query = ' '.join(["SELECT FROM $ms",
+            #"WHERE ANTENNA1 != ANTENNA2",
             "WHERE FIELD_ID={fid}".format(fid=field_id),
             "" if auto_correlations else "AND ANTENNA1 != ANTENNA2",
             "ORDERBY TIME, ANTENNA1, ANTENNA2, "
@@ -70,6 +73,7 @@ def __init__(self, msfile, auto_correlations=False):
         self.tables['ant']  = at = pt.table(self.antfile, ack=False, readonly=True)
         self.tables['freq'] = ft = pt.table(self.freqfile, ack=False, readonly=True)
         self.tables['field'] = fit = pt.table(self.fieldfile, ack=False, readonly=True)
+        self.tables['pol'] = polt = pt.table(self.polfile, ack=False, readonly=True)
 
         self.nrows = ordered_ms.nrows()
         self.na = at.nrows()
@@ -80,6 +84,9 @@ def __init__(self, msfile, auto_correlations=False):
         bl_query = "SELECT FROM $ordered_ms ORDERBY UNIQUE ANTENNA1, ANTENNA2"
         self.nbl = pt.taql(bl_query).nrows()
 
+        # Number of polarizations (assuming to be the same for all spectral windows)
+        self.npol = polt.getcol('NUM_CORR')[0]
+
         # Number of channels per band
         chan_per_band = ft.getcol('NUM_CHAN')
 
@@ -105,6 +112,8 @@ def __init__(self, msfile, auto_correlations=False):
             aeq=autocor_eq, ebl=expected_nbl, astr=autocor_str, nbl=self.nbl))
         self.log("Found {nb} band(s), containing {cpb} channels.".format(
             nb=self.nbands, nc=chan_per_band[0], cpb=chan_per_band))
+        self.log("Found {npol} polarization(s) in the POLARIZATION table.".format(
+            npol=self.npol))
 
         # Sanity check computed rows vs actual rows
         computed_rows = self.ntime*self.nbl*self.nbands
@@ -116,13 +125,14 @@ def __init__(self, msfile, auto_correlations=False):
                         nbl=self.nbl, nb=self.nbands,
                         cr=computed_rows, msr=self.nrows))
 
+
     def get_dims(self):
         """
         Returns a tuple with the number of
         timesteps, baselines, antenna, channel bands and channels
         """
         # Determine the problem dimensions
-        return self.ntime, self.nbl, self.na, self.nbands, self.nchan
+        return self.ntime, self.nbl, self.na, self.nbands, self.nchan, self.npol
 
     def log(self, msg, *args, **kwargs):
         montblanc.log.info('{lp} {m}'.format(lp=self.LOG_PREFIX, m=msg),
diff --git a/montblanc/impl/rime/slvr_config.py b/montblanc/impl/rime/slvr_config.py
index d5bae57b7..556534ed4 100644
--- a/montblanc/impl/rime/slvr_config.py
+++ b/montblanc/impl/rime/slvr_config.py
@@ -63,6 +63,14 @@ class RimeSolverConfig(SolverConfig):
         "If True, chi-squared terms are weighted with a vectorised sigma. "
         "If False, chi-squared terms are weighted with a single scalar sigma.")
 
+    # Enabling chi squared gradient computation with respect Sersic parameters
+    SERSIC_GRADIENT = 'sersic_gradient'
+    DEFAULT_SERSIC_GRADIENT = False
+    VALID_SERSIC_GRADIENT = [True, False]
+    SERSIC_GRADIENT_DESCRIPTION = (
+        "If True, chi_squared gradient is computed with respect to the sersic parameters for each source.")
+
+
     # weight vector initialisation keyword and valid values
     # This SolverConfig determines whether
     INIT_WEIGHTS = 'init_weights'
@@ -198,6 +206,13 @@ class RimeSolverConfig(SolverConfig):
             SolverConfig.DEFAULT: DEFAULT_E_BEAM_DEPTH,
             SolverConfig.REQUIRED: True
         },
+
+        SERSIC_GRADIENT: {
+            SolverConfig.DESCRIPTION: SERSIC_GRADIENT_DESCRIPTION,
+            SolverConfig.VALID: VALID_SERSIC_GRADIENT,
+            SolverConfig.DEFAULT: DEFAULT_SERSIC_GRADIENT,
+            SolverConfig.REQUIRED: True
+        },
     }
 
     def parser(self):
@@ -221,6 +236,12 @@ def parser(self):
             help=self.E_BEAM_DEPTH_DESCRIPTION,
             default=self.DEFAULT_E_BEAM_DEPTH)
 
+        p.add_argument('--{v}'.format(v=self.SERSIC_GRADIENT),
+            required=False,
+            type=bool,
+            help=self.SERSIC_GRADIENT_DESCRIPTION,
+            default=self.DEFAULT_SERSIC_GRADIENT)
+
         p.add_argument('--{v}'.format(v=self.WEIGHT_VECTOR),
             required=False,
             type=bool,
@@ -274,4 +295,4 @@ def parser(self):
             help=self.VERSION_DESCRIPTION,
             default=self.DEFAULT_VERSION)
 
-        return p
\ No newline at end of file
+        return p
diff --git a/montblanc/impl/rime/v4/RimeSolver.py b/montblanc/impl/rime/v4/RimeSolver.py
index 895e7f30e..fc3d519ad 100644
--- a/montblanc/impl/rime/v4/RimeSolver.py
+++ b/montblanc/impl/rime/v4/RimeSolver.py
@@ -30,14 +30,24 @@
 from montblanc.impl.rime.v4.gpu.RimeBSqrt import RimeBSqrt
 from montblanc.impl.rime.v4.gpu.RimeEKBSqrt import RimeEKBSqrt
 from montblanc.impl.rime.v4.gpu.RimeSumCoherencies import RimeSumCoherencies
+from montblanc.impl.rime.v4.gpu.SersicChiSquaredGradient import SersicChiSquaredGradient
 
 from montblanc.pipeline import Pipeline
 
 def get_pipeline(slvr_cfg):
-    return Pipeline([RimeBSqrt(),
-        RimeEBeam(),
-        RimeEKBSqrt(),
-        RimeSumCoherencies()])
+    sersic_grad = slvr_cfg[Options.SERSIC_GRADIENT]
+
+    if (sersic_grad):
+        return Pipeline([RimeBSqrt(),
+            RimeEBeam(),
+            RimeEKBSqrt(),
+            RimeSumCoherencies(),
+            SersicChiSquaredGradient()])
+    else:
+        return Pipeline([RimeBSqrt(),
+            RimeEBeam(),
+            RimeEKBSqrt(),
+            RimeSumCoherencies()])
 
 class RimeSolver(MontblancCUDASolver):
     """ RIME Solver Implementation """
diff --git a/montblanc/impl/rime/v4/config.py b/montblanc/impl/rime/v4/config.py
index 6295490fd..4c7cb6a3c 100644
--- a/montblanc/impl/rime/v4/config.py
+++ b/montblanc/impl/rime/v4/config.py
@@ -127,6 +127,8 @@ class Classifier(Enum):
     EKB_SQRT_INPUT = 'ekb_sqrt_input'
     COHERENCIES_INPUT = 'coherencies_input'
 
+
+
 # List of arrays
 A = [
     # UVW coordinates
@@ -177,15 +179,17 @@ class Classifier(Enum):
         default=1,
         test=lambda slvr, ary: rary(ary)),
 
-    ary_dict('E_beam', ('beam_lw', 'beam_mh', 'beam_nud', 4), 'ct',
+    ary_dict('E_beam', ('beam_lw', 'beam_mh', 'beam_nud', 'npol'), 'ct',
         classifiers=frozenset([Classifier.E_BEAM_INPUT]),
-        default=np.array([1,0,0,1])[np.newaxis,np.newaxis,np.newaxis,:],
+        default=lambda s, ary: (np.array([1,0,0,1])[np.newaxis,np.newaxis,:] if ary.shape[3] == 4 
+            else np.array([1])[np.newaxis,np.newaxis,:]),
         test=lambda slvr, ary: rary(ary)),
 
     # Direction-Independent Effects
-    ary_dict('G_term', ('ntime', 'na', 'nchan', 4), 'ct',
+    ary_dict('G_term', ('ntime', 'na', 'nchan', 'npol'), 'ct',
         classifiers=frozenset([Classifier.COHERENCIES_INPUT]),
-        default=np.array([1,0,0,1])[np.newaxis,np.newaxis,np.newaxis,:],
+        default=lambda s, ary: (np.array([1,0,0,1])[np.newaxis,np.newaxis,:] if ary.shape[3] == 4 
+            else np.array([1])[np.newaxis,np.newaxis,:]),
         test=lambda slvr, ary: rary(ary)),
 
     # Source Definitions
@@ -195,9 +199,10 @@ class Classifier(Enum):
         default=0,
         test=lambda slvr, ary: (rary(ary)-0.5) * 1e-1),
 
-    ary_dict('stokes', ('nsrc','ntime', 4), 'ft',
+    ary_dict('stokes', ('nsrc','ntime', 'npol'), 'ft',
         classifiers=frozenset([Classifier.B_SQRT_INPUT]),
-        default=np.array([1,0,0,0])[np.newaxis,np.newaxis,:],
+        default= lambda s, ary: (np.array([1,0,0,0])[np.newaxis,np.newaxis,:] if ary.shape[2] == 4 
+            else np.array([1])[np.newaxis,np.newaxis,:]),
         test=rand_stokes),
 
     ary_dict('alpha', ('nsrc','ntime'), 'ft',
@@ -216,7 +221,7 @@ class Classifier(Enum):
         test=rand_sersic_shape),
 
     # Visibility flagging arrays
-    ary_dict('flag', ('ntime', 'nbl', 'nchan', 4), np.uint8,
+    ary_dict('flag', ('ntime', 'nbl', 'nchan', 'npol'), np.uint8,
         classifiers=frozenset([Classifier.X2_INPUT,
             Classifier.COHERENCIES_INPUT]),
         default=0,
@@ -224,24 +229,26 @@ class Classifier(Enum):
             0, 1, size=ary.shape)),
 
     # Bayesian Data
-    ary_dict('weight_vector', ('ntime','nbl','nchan',4), 'ft',
+    ary_dict('weight_vector', ('ntime','nbl','nchan','npol'), 'ft',
         classifiers=frozenset([Classifier.X2_INPUT,
             Classifier.COHERENCIES_INPUT]),
         default=1,
         test=lambda slvr, ary: rary(ary)),
-    ary_dict('observed_vis', ('ntime','nbl','nchan',4), 'ct',
+    ary_dict('observed_vis', ('ntime','nbl','nchan','npol'), 'ct',
         classifiers=frozenset([Classifier.X2_INPUT,
             Classifier.COHERENCIES_INPUT]),
         default=0,
         test=lambda slvr, ary: rary(ary)),
 
     # Result arrays
-    ary_dict('B_sqrt', ('nsrc', 'ntime', 'nchan', 4), 'ct',
+    ary_dict('B_sqrt', ('nsrc', 'ntime', 'nchan', 'npol'), 'ct',
         classifiers=frozenset([Classifier.GPU_SCRATCH])),
-    ary_dict('jones', ('nsrc','ntime','na','nchan',4), 'ct',
+    ary_dict('jones', ('nsrc','ntime','na','nchan','npol'), 'ct',
         classifiers=frozenset([Classifier.GPU_SCRATCH])),
-    ary_dict('model_vis', ('ntime','nbl','nchan',4), 'ct',
+    ary_dict('model_vis', ('ntime','nbl','nchan','npol'), 'ct',
         classifiers=frozenset([Classifier.SIMULATOR_OUTPUT])),
     ary_dict('chi_sqrd_result', ('ntime','nbl','nchan'), 'ft',
         classifiers=frozenset([Classifier.GPU_SCRATCH])),
+    ary_dict('X2_grad', (3,'nssrc',), 'ft',
+        classifiers=frozenset([Classifier.GPU_SCRATCH])),
 ]
diff --git a/montblanc/impl/rime/v4/cpu/CPUSolver.py b/montblanc/impl/rime/v4/cpu/CPUSolver.py
index 1d27f8e24..7946b3599 100644
--- a/montblanc/impl/rime/v4/cpu/CPUSolver.py
+++ b/montblanc/impl/rime/v4/cpu/CPUSolver.py
@@ -675,6 +675,144 @@ def compute_gekb_vis(self, ekb_vis=None):
 
         return result
 
+
+    def compute_sersic_derivatives(self):
+        """
+        Compute the partial derivative values for the sersic (exponential) sources.
+        Returns a (nssrc, ntime, nbl, nchan) matrix of floating point scalars.
+        """
+
+        nssrc, ntime, nbl, nchan  = self.dim_local_size('nssrc', 'ntime', 'nbl', 'nchan')
+        ant0, ant1 = self.ap_idx()
+
+        # Calculate per baseline u from per antenna u
+        up, uq = self.uvw[:,:,0][ant0], self.uvw[:,:,0][ant1]
+        u = ne.evaluate('uq-up', {'up': up, 'uq': uq})
+
+        # Calculate per baseline v from per antenna v
+        vp, vq = self.uvw[:,:,1][ant0], self.uvw[:,:,1][ant1]
+        v = ne.evaluate('vq-vp', {'vp': vp, 'vq': vq})
+
+        # Calculate per baseline w from per antenna w
+        wp, wq = self.uvw[:,:,2][ant0], self.uvw[:,:,2][ant1]
+        w = ne.evaluate('wq-wp', {'wp': wp, 'wq': wq})
+
+        e1 = self.sersic_shape[0]
+        e2 = self.sersic_shape[1]
+        R = self.sersic_shape[2]
+
+        # OK, try obtain the same results with the fwhm factored out!
+        # u1 = u*(1+e1) - v*e2
+        # v1 = u*e2 + v*(1-e1)
+        u1 = ne.evaluate('u_1_e1 + v_e2',
+            { 'u_1_e1': np.outer(1+e1, u), 'v_e2' : np.outer(e2, v)})
+        v1 = ne.evaluate('u_e2 + v_1_e1', 
+            { 'u_e2' : np.outer(e2, u), 'v_1_e1' : np.outer(1-e1,v)})
+
+        assert u1.shape == (nssrc, ntime*nbl)
+        assert v1.shape == (nssrc, ntime*nbl)
+
+        e12 = 1 - e1 * e1 - e2 * e2
+
+        de1 = ne.evaluate('u1*(u_e12+2*e1*u1) + v1*(2*e1*v1-v_e12)',
+            local_dict={
+                'u1'   : u1,  'v1': v1,\
+                'u_e12': np.outer(e12, u),\
+                'v_e12': np.outer(e12, v),\
+                'e1': e1[:,np.newaxis]}).reshape(nssrc, ntime,nbl)
+
+        de2 = ne.evaluate('u1*(v_e12+2*e2*u1) + v1*(2*e2*v1+u_e12)',
+            local_dict={
+                'u1'   : u1,  'v1': v1,\
+                'u_e12': np.outer(e12, u),\
+                'v_e12': np.outer(e12, v),\
+                'e2': e2[:,np.newaxis]}).reshape(nssrc, ntime,nbl)
+
+        wavenum = (self.two_pi_over_c * self.frequency)
+
+        temp = ne.evaluate('(wavenum*R/e12)**2',
+            {
+              'R'  : R[:,np.newaxis],\
+              'e12': e12[:,np.newaxis],\
+              'wavenum': wavenum[np.newaxis,:]}).reshape(nssrc,nchan)
+
+        sfactor = ne.evaluate('1+temp*(u1*u1+v1*v1)',
+            {
+              'u1' : u1[:,:,np.newaxis],\
+              'v1' : v1[:,:,np.newaxis],\
+              'temp'  : temp[:,np.newaxis,:]}).reshape(nssrc,ntime,nbl,nchan)
+
+        sersic_deriv = np.empty((3,nssrc,ntime,nbl,nchan))
+          
+        # partial derivatives with respect to e1 and e2
+        sersic_deriv[0] = ne.evaluate('de1*temp/(sfactor*e12)',
+            { 
+              'de1' : de1[:,:,:,np.newaxis],\
+              'temp': temp[:,np.newaxis,np.newaxis,:],\
+              'e12' : e12[:,np.newaxis,np.newaxis,np.newaxis],\
+              'sfactor': sfactor }).reshape(nssrc,ntime,nbl,nchan)
+
+        sersic_deriv[1] = ne.evaluate('de2*temp/(sfactor*e12)',
+            {
+              'de2' : de2[:,:,:,np.newaxis],\
+              'temp': temp[:,np.newaxis,np.newaxis,:], \
+              'e12' : e12[:,np.newaxis,np.newaxis,np.newaxis],\
+              'sfactor': sfactor }).reshape(nssrc,ntime,nbl,nchan)
+
+        # partial derivatives with respect to scalelength
+        sersic_deriv[2] = ne.evaluate('(sfactor-1)/(sfactor*R)',
+            {
+              'R' : R[:,np.newaxis,np.newaxis,np.newaxis],\
+              'sfactor' : sfactor }).reshape(nssrc,ntime,nbl,nchan)
+
+        return sersic_deriv
+
+
+
+
+    def compute_gekb_vis_grad(self, ekb_jones=None):
+        """
+        Computes the gradient of the complex visibilities with respect to the sersic parameters.
+        Returns a (3,nssrc,ntime,nbl,nchan,4) matrix of complex scalars.
+        """
+
+        nssrc, npsrc, ngsrc, nsrc, ntime, nbl, nchan = self.dim_local_size('nssrc', 'npsrc', 'ngsrc', 'nsrc', 'ntime', 'nbl', 'nchan')
+
+        if ekb_jones is None:
+            ekb_jones = self.compute_ekb_jones_per_bl()
+
+        want_shape = (nsrc, ntime, nbl, nchan, 4)
+        assert ekb_jones.shape == want_shape, \
+            'Expected shape %s. Got %s instead.' % \
+            (want_shape, ekb_jones.shape)
+
+        ant0, ant1 = self.ap_idx(chan=True)
+        g_term_p = self.G_term[ant0]
+        g_term_q = self.G_term[ant1]
+
+        assert g_term_p.shape == (ntime, nbl, nchan, 4)
+
+        gekb_jones = np.empty([nssrc,ntime,nbl,nchan,4])
+        for i in range(nssrc):
+            gekb_jones[i] = (self.jones_multiply(g_term_p, ekb_jones[i])
+                .reshape(ntime, nbl, nchan, 4))
+            gekb_jones[i] = (self.jones_multiply(gekb_jones[i], g_term_q, hermitian=True)
+                .reshape(ntime, nbl, nchan, 4))
+
+        sersic_deriv = self.compute_sersic_derivatives()
+
+        src_beg = npsrc + ngsrc
+        src_end = npsrc + ngsrc + nssrc
+        vis_grad = np.empty((3,nssrc,ntime,nbl,nchan,4),dtype = np.complex)
+        vis_grad[0] = gekb_jones[src_beg:src_end]*sersic_deriv[0,:,:,:,:,np.newaxis]
+        vis_grad[1] = gekb_jones[src_beg:src_end]*sersic_deriv[1,:,:,:,:,np.newaxis]
+        vis_grad[2] = gekb_jones[src_beg:src_end]*sersic_deriv[2,:,:,:,:,np.newaxis]
+
+        assert vis_grad.shape == (3, nssrc, ntime, nbl, nchan, 4)
+
+        return vis_grad
+
+
     def compute_chi_sqrd_sum_terms(self, vis=None):
         """
         Computes the terms of the chi squared sum,
@@ -736,14 +874,72 @@ def compute_chi_sqrd(self, chi_sqrd_terms=None):
         return (term_sum if self.use_weight_vector() is True
             else term_sum / self.sigma_sqrd)
 
+
+    def compute_chi_sqrd_gradient(self, vis=None, vis_grad=None):
+        """ Computes the chi squared gradient with respect to Sersic parameters.
+        Parameters:
+            weight_vector : boolean
+                True if the chi squared test
+                should be computed with a noise vector
+        Returns a (3,nssrc) matrix of real values
+        """
+        nssrc, ntime, nbl, nchan = self.dim_local_size('nssrc', 'ntime', 'nbl', 'nchan')
+
+        # Do the chi squared gradient on the CPU.
+        # If we're not using the weight vector, sum and
+        # divide by the sigma squared.
+        # Otherwise, simply return the sum
+        if vis is None: vis = self.compute_gekb_vis()
+        if vis_grad is None: vis_grad = self.compute_gekb_vis_grad()
+
+
+        # Compute the residuals if this has not yet happened
+        if not self.outputs_residuals():
+            d = ne.evaluate('(ovis - mvis)*where(flag > 0, 0, 1)', {
+                'mvis': vis,
+                'ovis': self.observed_vis,
+                'flag' : self.flag })
+            assert d.shape == (ntime, nbl, nchan, 4)
+        else:
+            d = vis
+
+        re = d.real
+        im = d.imag
+        wv = self.weight_vector
+
+        # Multiply by the weight vector if required
+        if self.use_weight_vector() is True:
+            ne.evaluate('re*wv', {'re': re, 'wv': wv}, out=re)
+            ne.evaluate('im*wv', {'im': im, 'wv': wv}, out=im)       
+
+        # Multiply by the visibilities gradient
+        for p in xrange(3):
+            for s in xrange(nssrc):
+                vis_grad[p,s].real = ne.evaluate('re*dre', {'re': re, 'dre': vis_grad[p,s].real})
+                vis_grad[p,s].imag = ne.evaluate('im*dim', {'im': im, 'dim': vis_grad[p,s].imag})
+
+        # Sum the real and imaginary terms together
+        # for the final result.
+        re_sum = ne.evaluate('sum(re,5)', {'re': vis_grad.real})
+        im_sum = ne.evaluate('sum(im,5)', {'im': vis_grad.imag})
+        chi_sqrd_grad_terms = ne.evaluate('re_sum + im_sum',
+            {'re_sum': re_sum, 'im_sum': im_sum})
+        assert chi_sqrd_grad_terms.shape == (3, nssrc, ntime, nbl, nchan)
+
+        self.X2_grad = 6*np.sum(chi_sqrd_grad_terms,(2,3,4)).reshape(3,nssrc)
+           
+        return (self.X2_grad if self.use_weight_vector() is True \
+            else self.X2_grad / self.sigma_sqrd)
+
+
     def solve(self):
         """ Solve the RIME """
 
         self.jones[:] = self.compute_ekb_sqrt_jones_per_ant()
         
-        self.vis[:] = self.compute_gekb_vis()
+        self.model_vis[:] = self.compute_gekb_vis()
         
         self.chi_sqrd_result[:] = self.compute_chi_sqrd_sum_terms(
-            self.vis)
+            self.model_vis)
         
-        self.set_X2(self.compute_chi_sqrd(self.chi_sqrd_result))
\ No newline at end of file
+        self.set_X2(self.compute_chi_sqrd(self.chi_sqrd_result))
diff --git a/montblanc/impl/rime/v4/gpu/RimeBSqrt.py b/montblanc/impl/rime/v4/gpu/RimeBSqrt.py
index 784a2152a..0c6df45f3 100644
--- a/montblanc/impl/rime/v4/gpu/RimeBSqrt.py
+++ b/montblanc/impl/rime/v4/gpu/RimeBSqrt.py
@@ -127,6 +127,59 @@
     B_sqrt[i] = B_square_root;
 }
 
+
+
+// Case NPOL = 1
+
+template <
+    typename T,
+    typename Tr=montblanc::kernel_traits<T>,
+    typename Po=montblanc::kernel_policies<T> >
+__device__
+void rime_jones1_B_sqrt_impl(
+    T * stokes,
+    T * alpha,
+    T * frequency,
+    T * ref_frequency,
+    typename Tr::ct * B_sqrt)
+{
+    int POLCHAN = blockIdx.x*blockDim.x + threadIdx.x;
+    int TIME = blockIdx.y*blockDim.y + threadIdx.y;
+    int SRC = blockIdx.z*blockDim.z + threadIdx.z;
+
+    if(SRC >= DEXT(nsrc) || TIME >= DEXT(ntime) || POLCHAN >= DEXT(nchan))
+        { return; }
+
+    __shared__ T freq_ratio[BLOCKDIMX];
+
+    // TODO. Using 3 times more shared memory than we
+    // really require here, since there's only
+    // one frequency per channel.
+    if(threadIdx.y == 0 && threadIdx.z == 0)
+    {
+        freq_ratio[threadIdx.x] = frequency[POLCHAN] / 
+            ref_frequency[POLCHAN];
+    }
+
+    __syncthreads();
+
+    // Calculate the power term
+    int i = SRC*NTIME + TIME;
+    typename Tr::ft power = Po::pow(freq_ratio[threadIdx.x], alpha[i]);
+
+    // Read in the stokes parameter,
+    // multiplying it by the power term
+    typename Tr::ft pol = 2.*stokes[i]*power;    // include both the equal terms XX and YY
+
+    // Write out the square root of the brightness
+    i = (SRC*NTIME + TIME)*NPOLCHAN + POLCHAN;
+    B_sqrt[i].x = Po::sqrt(pol);
+    B_sqrt[i].y = 0.;
+}
+
+
+
+
 extern "C" {
 
 #define stamp_jones_B_sqrt_fn(ft,ct) \
@@ -142,9 +195,27 @@
         frequency, ref_frequency, B_sqrt); \
 }
 
+#define stamp_jones1_B_sqrt_fn(ft,ct) \
+__global__ void \
+rime_jones1_B_sqrt_ ## ft( \
+    ft * stokes, \
+    ft * alpha, \
+    ft * frequency, \
+    ft * ref_frequency, \
+    ct * B_sqrt) \
+{ \
+    rime_jones1_B_sqrt_impl<ft>(stokes, alpha, \
+        frequency, ref_frequency, B_sqrt); \
+}
+
+
 stamp_jones_B_sqrt_fn(float,float2);
 stamp_jones_B_sqrt_fn(double,double2);
 
+stamp_jones1_B_sqrt_fn(float,float2);
+stamp_jones1_B_sqrt_fn(double,double2);
+
+
 } // extern "C" {
 """)
 
@@ -154,8 +225,8 @@ def __init__(self):
 
     def initialise(self, solver, stream=None):
         slvr = solver
-        nsrc, ntime, npolchan = slvr.dim_local_size(
-            'nsrc', 'ntime', 'npolchan')
+        nsrc, ntime, npolchan, npol = slvr.dim_local_size(
+            'nsrc', 'ntime', 'npolchan', 'npol')
 
         # Get a property dictionary off the solver
         D = slvr.template_dict()
@@ -171,9 +242,14 @@ def initialise(self, solver, stream=None):
         regs = str(FLOAT_PARAMS['maxregs'] \
                 if slvr.is_float() else DOUBLE_PARAMS['maxregs'])
 
-        kname = 'rime_jones_B_sqrt_float' \
-            if slvr.is_float() is True else \
-            'rime_jones_B_sqrt_double'
+        if npol == 1:
+            kname = 'rime_jones1_B_sqrt_float' \
+                if slvr.is_float() is True else \
+                'rime_jones1_B_sqrt_double'
+        else:
+            kname = 'rime_jones_B_sqrt_float' \
+                if slvr.is_float() is True else \
+                'rime_jones_B_sqrt_double'
 
         kernel_string = KERNEL_TEMPLATE.substitute(**D)
 
diff --git a/montblanc/impl/rime/v4/gpu/RimeEBeam.py b/montblanc/impl/rime/v4/gpu/RimeEBeam.py
index c174f2640..1d9215406 100644
--- a/montblanc/impl/rime/v4/gpu/RimeEBeam.py
+++ b/montblanc/impl/rime/v4/gpu/RimeEBeam.py
@@ -83,18 +83,19 @@
 #define BEAM_MH LOCAL(beam_mh)
 #define BEAM_NUD LOCAL(beam_nud)
 
+// Infer channel from x thread
+ __device__ __forceinline__ int thread_chan()
+    { return threadIdx.x >> 2; }
+
 // Infer polarisation from x thread
 __device__ __forceinline__ int ebeam_pol()
     { return threadIdx.x & 0x3; }
 
-// Infer channel from x thread
-__device__ __forceinline__ int thread_chan()
-    { return threadIdx.x >> 2; }
 
 template <
     typename T,
     typename Tr=montblanc::kernel_traits<T>,
-    typename Po=montblanc::kernel_policies<T> >
+    typename Po=montblanc::kernel_policies<T> > 
 __device__ __forceinline__
 void trilinear_interpolate(
     typename Tr::ct & pol_sum,
@@ -105,8 +106,31 @@
     const T & gchan,
     const T & weight)
 {
-    int i = ((int(gl)*BEAM_MH + int(gm))*BEAM_NUD + int(gchan))*NPOL
-        + ebeam_pol();
+    int i = ((int(gl)*BEAM_MH + int(gm))*BEAM_NUD + int(gchan))*NPOL +ebeam_pol();
+
+    // Perhaps unnecessary as long as BLOCKDIMX is 32
+    typename Tr::ct pol = cub::ThreadLoad<cub::LOAD_LDG>(E_beam + i);
+    pol_sum.x += weight*pol.x;
+    pol_sum.y += weight*pol.y;
+    abs_sum += weight*Po::abs(pol);
+}
+
+
+template <
+    typename T,
+    typename Tr=montblanc::kernel_traits<T>,
+    typename Po=montblanc::kernel_policies<T> > 
+__device__ __forceinline__
+void trilinear_interpolate_J1(
+    typename Tr::ct & pol_sum,
+    typename Tr::ft & abs_sum,
+    typename Tr::ct * E_beam,
+    const T & gl,
+    const T & gm,
+    const T & gchan,
+    const T & weight)
+{
+    int i = ((int(gl)*BEAM_MH + int(gm))*BEAM_NUD + int(gchan));
 
     // Perhaps unnecessary as long as BLOCKDIMX is 32
     typename Tr::ct pol = cub::ThreadLoad<cub::LOAD_LDG>(E_beam + i);
@@ -115,6 +139,7 @@
     abs_sum += weight*Po::abs(pol);
 }
 
+
 template <typename T> class EBeamTraits {};
 
 template <> class EBeamTraits<float>
@@ -343,6 +368,216 @@
     }
 }
 
+
+
+template <
+    typename T,
+    typename Tr=montblanc::kernel_traits<T>,
+    typename Po=montblanc::kernel_policies<T> >
+__device__
+void rime_jones1_E_beam_impl(
+    typename EBeamTraits<T>::LMType * lm,
+    typename EBeamTraits<T>::ParallacticAngleType * parallactic_angles,
+    typename EBeamTraits<T>::PointErrorType * point_errors,
+    typename EBeamTraits<T>::AntennaScaleType * antenna_scaling,
+    typename Tr::ft * frequency,
+    typename Tr::ct * E_beam,
+    typename Tr::ct * jones,
+    T beam_ll, T beam_lm, T beam_lfreq,
+    T beam_ul, T beam_um, T beam_ufreq)
+{
+    int POLCHAN = blockIdx.x*blockDim.x + threadIdx.x;
+    int ANT = blockIdx.y*blockDim.y + threadIdx.y;
+    int TIME = blockIdx.z*blockDim.z + threadIdx.z;
+
+    typedef typename EBeamTraits<T>::LMType LMType;
+    typedef typename EBeamTraits<T>::PointErrorType PointErrorType;
+    typedef typename EBeamTraits<T>::AntennaScaleType AntennaScaleType;
+
+    if(TIME >= DEXT(ntime) || ANT >= DEXT(na) || POLCHAN >= DEXT(npolchan))
+        return;
+
+    __shared__ struct {
+        T lscale;             // l axis scaling factor
+        T mscale;             // m axis scaling factor
+        T pa_sin[BLOCKDIMZ][BLOCKDIMY];  // sin of parallactic angle
+        T pa_cos[BLOCKDIMZ][BLOCKDIMY];  // cos of parallactic angle
+        T gchan0[BLOCKDIMX];  // channel grid position (snapped)
+        T gchan1[BLOCKDIMX];  // channel grid position (snapped)
+        T chd[BLOCKDIMX];    // difference between gchan0 and actual grid position
+        PointErrorType pe[BLOCKDIMZ][BLOCKDIMY][BLOCKDIMX];  // pointing errors
+        AntennaScaleType as[BLOCKDIMY][BLOCKDIMX];           // antenna scaling
+    } shared;
+
+
+    int i;
+
+    // Precompute l and m scaling factors in shared memory
+    if(threadIdx.z == 0 && threadIdx.y == 0 && threadIdx.z == 0)
+    {
+        shared.lscale = T(BEAM_LW - 1) / (beam_ul - beam_ll);
+        shared.mscale = T(BEAM_MH - 1) / (beam_um - beam_lm);
+    }
+
+    // Pointing errors vary by time, antenna and channel,
+    i = (TIME*NA + ANT)*NCHAN + POLCHAN;
+    shared.pe[threadIdx.z][threadIdx.y][threadIdx.x] = point_errors[i];
+
+    // Antenna scaling factors vary by antenna and channel,
+    // but not timestep
+    if(threadIdx.z == 0)
+    {
+        i = ANT*NCHAN + POLCHAN;
+        shared.as[threadIdx.y][threadIdx.x] = antenna_scaling[i];
+    }
+
+    // Frequency vary by channel, but not timestep or antenna
+    if(threadIdx.z == 0 && threadIdx.y == 0)
+    {
+        // Channel coordinate
+        // channel grid position
+        T freqscale = T(BEAM_NUD-1) / (beam_ufreq - beam_lfreq);
+        T chan = freqscale * (frequency[POLCHAN] - beam_lfreq);
+        // clamp to grid edges
+        chan = Po::clamp(chan, 0.0, BEAM_NUD-1);
+        // Snap to grid coordinate
+        shared.gchan0[threadIdx.x] = Po::floor(chan);
+        shared.gchan1[threadIdx.x] = Po::min(
+            shared.gchan0[threadIdx.x] + 1.0, BEAM_NUD-1);
+        // Offset of snapped coordinate from grid position
+        shared.chd[threadIdx.x] = chan - shared.gchan0[threadIdx.x];
+    }
+
+    // Parallactic angles vary by time and antenna, but not channel
+    if(threadIdx.x == 0)
+    {
+        i = TIME*NA + ANT;
+        T parangle = parallactic_angles[i];
+        Po::sincos(parangle,
+            &shared.pa_sin[threadIdx.z][threadIdx.y],
+            &shared.pa_cos[threadIdx.z][threadIdx.y]);
+    }
+
+    __syncthreads();
+
+    // Loop over sources
+    for(int SRC=0; SRC < DEXT(nsrc); ++SRC)
+    {
+        // lm coordinate for this source
+        LMType rlm = lm[SRC];
+
+        // L coordinate
+        // Rotate
+        T l = rlm.x*shared.pa_cos[threadIdx.z][threadIdx.y] -
+            rlm.y*shared.pa_sin[threadIdx.z][threadIdx.y];
+        // Add the pointing errors for this antenna.
+        l += shared.pe[threadIdx.z][threadIdx.y][threadIdx.x].x;
+        // Scale by antenna scaling factors
+        l *= shared.as[threadIdx.y][threadIdx.x].x;
+        // l grid position
+        l = shared.lscale * (l - beam_ll);
+        // clamp to grid edges
+        l = Po::clamp(0.0, l, BEAM_LW-1);
+        // Snap to grid coordinate
+        T gl0 = Po::floor(l);
+        T gl1 = Po::min(gl0 + 1.0, BEAM_LW-1);
+        // Offset of snapped coordinate from grid position
+        T ld = l - gl0;
+
+        // M coordinate
+        // rotate
+        T m = rlm.x*shared.pa_sin[threadIdx.z][threadIdx.y] +
+            rlm.y*shared.pa_cos[threadIdx.z][threadIdx.y];
+        // Add the pointing errors for this antenna.
+        m += shared.pe[threadIdx.z][threadIdx.y][threadIdx.x].y;
+        // Scale by antenna scaling factors
+        m *= shared.as[threadIdx.y][threadIdx.x].y;
+        // m grid position
+        m = shared.mscale * (m - beam_lm);
+        // clamp to grid edges
+        m = Po::clamp(0.0, m, BEAM_MH-1);
+        // Snap to grid position
+        T gm0 = Po::floor(m);
+        T gm1 = Po::min(gm0 + 1.0, BEAM_MH-1);
+        // Offset of snapped coordinate from grid position
+        T md = m - gm0;
+
+        typename Tr::ct pol_sum = Po::make_ct(0.0, 0.0);
+        typename Tr::ft abs_sum = T(0.0);
+
+        // A simplified trilinear weighting is used here. Given
+        // point x between points x1 and x2, with function f
+        // provided values f(x1) and f(x2) at these points.
+        //
+        // x1 ------- x ---------- x2
+        //
+        // Then, the value of f can be approximated using the following:
+        // f(x) ~= f(x1)(x2-x)/(x2-x1) + f(x2)(x-x1)/(x2-x1)
+        //
+        // Note how the value f(x1) is weighted with the distance
+        // from the opposite point (x2-x).
+        //
+        // As we are interpolating on a grid, we have the following
+        // 1. (x2 - x1) == 1
+        // 2. (x - x1)  == 1 - 1 + (x - x1)
+        //              == 1 - (x2 - x1) + (x - x1)
+        //              == 1 - (x2 - x)
+        // 2. (x2 - x)  == 1 - 1 + (x2 - x)
+        //              == 1 - (x2 - x1) + (x2 - x)
+        //              == 1 - (x - x1)
+        //
+        // Extending the above to 3D, we have
+        // f(x,y,z) ~= f(x1,y1,z1)(x2-x)(y2-y)(z2-z) + ...
+        //           + f(x2,y2,z2)(x-x1)(y-y1)(z-z1)
+        //
+        // f(x,y,z) ~= f(x1,y1,z1)(1-(x-x1))(1-(y-y1))(1-(z-z1)) + ...
+        //           + f(x2,y2,z2)   (x-x1)    (y-y1)    (z-z1)
+
+        // Load in the complex values from the E beam
+        // at the supplied coordinate offsets.
+        trilinear_interpolate_J1<T>(pol_sum, abs_sum, E_beam,
+            gl0, gm0, shared.gchan0[threadIdx.x],
+            (1.0f-ld)*(1.0f-md)*(1.0f-shared.chd[threadIdx.x]));
+        trilinear_interpolate_J1<T>(pol_sum, abs_sum, E_beam,
+            gl1, gm0, shared.gchan0[threadIdx.x],
+            ld*(1.0f-md)*(1.0f-shared.chd[threadIdx.x]));
+        trilinear_interpolate_J1<T>(pol_sum, abs_sum, E_beam,
+            gl0, gm1, shared.gchan0[threadIdx.x],
+            (1.0f-ld)*md*(1.0f-shared.chd[threadIdx.x]));
+        trilinear_interpolate_J1<T>(pol_sum, abs_sum, E_beam,
+            gl1, gm1, shared.gchan0[threadIdx.x],
+            ld*md*(1.0f-shared.chd[threadIdx.x]));
+
+        trilinear_interpolate_J1<T>(pol_sum, abs_sum, E_beam,
+            gl0, gm0, shared.gchan1[threadIdx.x],
+            (1.0f-ld)*(1.0f-md)*shared.chd[threadIdx.x]);
+        trilinear_interpolate_J1<T>(pol_sum, abs_sum, E_beam,
+            gl1, gm0, shared.gchan1[threadIdx.x],
+            ld*(1.0f-md)*shared.chd[threadIdx.x]);
+        trilinear_interpolate_J1<T>(pol_sum, abs_sum, E_beam,
+            gl0, gm1, shared.gchan1[threadIdx.x],
+            (1.0f-ld)*md*shared.chd[threadIdx.x]);
+        trilinear_interpolate_J1<T>(pol_sum, abs_sum, E_beam,
+            gl1, gm1, shared.gchan1[threadIdx.x],
+            ld*md*shared.chd[threadIdx.x]);
+
+        // Normalise the angle and multiply in the absolute sum
+        typename Tr::ft norm = Po::rsqrt(pol_sum.x*pol_sum.x + pol_sum.y*pol_sum.y);
+        if(!::isfinite(norm))
+            { norm = 1.0; }
+
+        pol_sum.x *= norm * abs_sum;
+        pol_sum.y *= norm * abs_sum;
+
+        i = ((SRC*NTIME + TIME)*NA + ANT)*NPOLCHAN + POLCHAN;
+        jones[i] = pol_sum;
+    }
+}
+
+
+
+
+
 extern "C" {
 
 #define stamp_jones_E_beam_fn(ft,ct,lm_type,pa_type,pe_type,as_type) \
@@ -365,6 +600,26 @@
         beam_ul, beam_um, beam_ufreq); \
 }
 
+#define stamp_jones1_E_beam_fn(ft,ct,lm_type,pa_type,pe_type,as_type) \
+__global__ void \
+rime_jones1_E_beam_ ## ft( \
+    lm_type * lm, \
+    pa_type * parallactic_angles, \
+    pe_type * point_errors, \
+    as_type * antenna_scaling, \
+    ft * frequency, \
+    ct * E_beam, \
+    ct * jones, \
+    ft beam_ll, ft beam_lm, ft beam_lfreq, \
+    ft beam_ul, ft beam_um, ft beam_ufreq) \
+{ \
+    rime_jones1_E_beam_impl<ft>( \
+        lm, parallactic_angles, point_errors, \
+        antenna_scaling, frequency, E_beam, jones, \
+        beam_ll, beam_lm, beam_lfreq, \
+        beam_ul, beam_um, beam_ufreq); \
+}
+
 ${stamp_function}
 
 } // extern "C" {
@@ -376,7 +631,7 @@ def __init__(self):
 
     def initialise(self, solver, stream=None):
         slvr = solver
-        ntime, na, npolchan = slvr.dim_local_size('ntime', 'na', 'npolchan')
+        ntime, na, npolchan, npol = slvr.dim_local_size('ntime', 'na', 'npolchan', 'npol')
 
         # Get a property dictionary off the solver
         D = slvr.template_dict()
@@ -400,12 +655,20 @@ def initialise(self, solver, stream=None):
             'float' if slvr.is_float() else 'double',
             'float2' if slvr.is_float() else 'double2',
             'float2' if slvr.is_float() else 'double2'])
-        stamp_fn = ''.join(['stamp_jones_E_beam_fn(', stamp_args, ')'])
-        D['stamp_function'] = stamp_fn
 
-        kname = 'rime_jones_E_beam_float' \
-            if slvr.is_float() is True else \
-            'rime_jones_E_beam_double'
+        if npol == 4:
+            stamp_fn = ''.join(['stamp_jones_E_beam_fn(', stamp_args, ')'])
+            D['stamp_function'] = stamp_fn
+            kname = 'rime_jones_E_beam_float' \
+                if slvr.is_float() is True else \
+                'rime_jones_E_beam_double'
+
+        if npol == 1:
+            stamp_fn = ''.join(['stamp_jones1_E_beam_fn(', stamp_args, ')'])
+            D['stamp_function'] = stamp_fn
+            kname = 'rime_jones1_E_beam_float' \
+                if slvr.is_float() is True else \
+                'rime_jones1_E_beam_double'
 
         kernel_string = KERNEL_TEMPLATE.substitute(**D)
 
diff --git a/montblanc/impl/rime/v4/gpu/RimeEKBSqrt.py b/montblanc/impl/rime/v4/gpu/RimeEKBSqrt.py
index ba802f801..325d0800a 100644
--- a/montblanc/impl/rime/v4/gpu/RimeEKBSqrt.py
+++ b/montblanc/impl/rime/v4/gpu/RimeEKBSqrt.py
@@ -110,7 +110,6 @@ class EKBTraits<double> {
     int POLCHAN = blockIdx.x*blockDim.x + threadIdx.x;
     int ANT = blockIdx.y*blockDim.y + threadIdx.y;
     int TIME = blockIdx.z*blockDim.z + threadIdx.z;
-    #define POL (threadIdx.x & 0x3)
 
     if(TIME >= DEXT(ntime) || ANT >= DEXT(na) || POLCHAN >= DEXT(npolchan))
         return;
@@ -134,7 +133,13 @@ class EKBTraits<double> {
 
     // Wavelengths vary by channel, not by time and antenna
     if(threadIdx.y == 0 && threadIdx.z == 0)
+    {
+      if (NPOL > 1)
         { freq[threadIdx.x] = frequency[POLCHAN>>2]; }
+      else
+        { freq[threadIdx.x] = frequency[POLCHAN]; }
+    }
+   
 
     __syncthreads();
 
@@ -168,7 +173,11 @@ class EKBTraits<double> {
         i = ((SRC*NTIME + TIME)*NA + ANT)*NPOLCHAN + POLCHAN;
         // Load in the E Beam, and multiply it by KB
         typename Tr::ct J = jones[i];
-        montblanc::jones_multiply_4x4_in_place<T>(J, cplx_phase);
+        if (NPOL > 1)
+          montblanc::jones_multiply_4x4_in_place<T>(J, cplx_phase);
+        else
+          montblanc::complex_multiply_in_place<T>(J, cplx_phase);
+       
 
         // Write out the jones matrices
         i = ((SRC*NTIME + TIME)*NA + ANT)*NPOLCHAN + POLCHAN;
diff --git a/montblanc/impl/rime/v4/gpu/RimeSumCoherencies.py b/montblanc/impl/rime/v4/gpu/RimeSumCoherencies.py
index 991f208fc..049554af7 100644
--- a/montblanc/impl/rime/v4/gpu/RimeSumCoherencies.py
+++ b/montblanc/impl/rime/v4/gpu/RimeSumCoherencies.py
@@ -174,7 +174,12 @@ class SumCohTraits<double> {
     // TODO uses 4 times the actually required space, since
     // we don't need to store a frequency per polarisation
     if(threadIdx.y == 0 && threadIdx.z == 0)
+    {
+      if (NPOL > 1)
         { shared.freq[threadIdx.x] = frequency[POLCHAN >> 2]; }
+      else
+        { shared.freq[threadIdx.x] = frequency[POLCHAN]; }
+    }
 
     // We process sources in batches, accumulating visibility values.
     // Visibilities are read in and written out at the start and end
@@ -202,7 +207,10 @@ class SumCohTraits<double> {
         typename Tr::ct ant_one = jones[i];
         i = ((SRC*NTIME + TIME)*NA + ANT2)*NPOLCHAN + POLCHAN;
         typename Tr::ct ant_two = jones[i];
-        montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant_one, ant_two);
+        if (NPOL > 1)
+           montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant_one, ant_two);
+        else
+           montblanc::complex_conjugate_multiply_in_place<T>(ant_one, ant_two);
 
         polsum.x += ant_one.x;
         polsum.y += ant_one.y;
@@ -248,7 +256,11 @@ class SumCohTraits<double> {
         ant_one.y *= exp;
         i = ((SRC*NTIME + TIME)*NA + ANT2)*NPOLCHAN + POLCHAN;
         typename Tr::ct ant_two = jones[i];
-        montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant_one, ant_two);
+        if (NPOL > 1)
+          montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant_one, ant_two);
+        else
+          montblanc::complex_conjugate_multiply_in_place<T>(ant_one, ant_two);
+       
 
         polsum.x += ant_one.x;
         polsum.y += ant_one.y;
@@ -298,7 +310,10 @@ class SumCohTraits<double> {
         ant_one.y *= sersic_factor;
         i = ((SRC*NTIME + TIME)*NA + ANT2)*NPOLCHAN + POLCHAN;
         typename Tr::ct ant_two = jones[i];
-        montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant_one, ant_two);
+        if (NPOL > 1)
+          montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant_one, ant_two);
+        else
+          montblanc::complex_conjugate_multiply_in_place<T>(ant_one, ant_two);
 
         polsum.x += ant_one.x;
         polsum.y += ant_one.y;
@@ -320,12 +335,20 @@ class SumCohTraits<double> {
     // Multiply the visibility by antenna 1's g term
     i = (TIME*NA + ANT1)*NPOLCHAN + POLCHAN;
     typename Tr::ct model_vis = G_term[i];
-    montblanc::jones_multiply_4x4_in_place<T>(model_vis, polsum);
+    if (NPOL > 1)
+        montblanc::jones_multiply_4x4_in_place<T>(model_vis, polsum);
+    else
+        montblanc::complex_multiply_in_place<T>(model_vis, polsum);
+   
 
     // Multiply the visibility by antenna 2's g term
     i = (TIME*NA + ANT2)*NPOLCHAN + POLCHAN;
     typename Tr::ct ant2_g_term = G_term[i];
-    montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(model_vis, ant2_g_term);
+    if (NPOL > 1)
+      montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(model_vis, ant2_g_term);
+    else
+      montblanc::complex_conjugate_multiply_in_place<T>(model_vis, ant2_g_term);
+    
 
     // Compute the chi squared sum terms
     i = (TIME*NBL + BL)*NPOLCHAN + POLCHAN;
@@ -367,28 +390,36 @@ class SumCohTraits<double> {
         obs_vis.y *= w;
     }
 
-    // Now, add the real and imaginary components
-    // of each adjacent group of four polarisations
-    // into the first polarisation.
-    typename Tr::ct other = cub::ShuffleIndex(obs_vis, cub::LaneId() + 2);
-
-    // Add polarisations 2 and 3 to 0 and 1
-    if((POLCHAN & 0x3) < 2)
+    if (NPOL > 1)
     {
-        obs_vis.x += other.x;
-        obs_vis.y += other.y;
-    }
-
-    other = cub::ShuffleIndex(obs_vis, cub::LaneId() + 1);
-
-    // If this is the polarisation 0, add polarisation 1
-    // and write out this chi squared sum term
-    if((POLCHAN & 0x3) == 0) {
+       // Now, add the real and imaginary components
+       // of each adjacent group of four polarisations
+       // into the first polarisation.
+       typename Tr::ct other = cub::ShuffleIndex(obs_vis, cub::LaneId() + 2);
+
+       // Add polarisations 2 and 3 to 0 and 1
+       if((POLCHAN & 0x3) < 2)
+       {
+          obs_vis.x += other.x;
+          obs_vis.y += other.y;
+       }
+
+       other = cub::ShuffleIndex(obs_vis, cub::LaneId() + 1);
+
+       // If this is the polarisation 0, add polarisation 1
+       // and write out this chi squared sum term
+      if((POLCHAN & 0x3) == 0) {
         obs_vis.x += other.x;
         obs_vis.y += other.y;
 
         i = (TIME*NBL + BL)*NCHAN + (POLCHAN >> 2);
         chi_sqrd_result[i] = obs_vis.x + obs_vis.y;
+      }
+    }
+    else
+    {
+      i = (TIME*NBL + BL)*NCHAN + POLCHAN;
+      chi_sqrd_result[i] = obs_vis.x + obs_vis.y;
     }
 }
 
diff --git a/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
new file mode 100644
index 000000000..f1389cb01
--- /dev/null
+++ b/montblanc/impl/rime/v4/gpu/SersicChiSquaredGradient.py
@@ -0,0 +1,480 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2016 Marzia Rivi, Simon Perkins
+#
+# This file is part of montblanc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+import numpy as np
+import string
+
+import pycuda.driver as cuda
+import pycuda.gpuarray as gpuarray
+from pycuda.compiler import SourceModule
+
+import montblanc
+import montblanc.util as mbu
+from montblanc.node import Node
+
+
+from montblanc.config import RimeSolverConfig as Options
+
+FLOAT_PARAMS = {
+    'BLOCKDIMX': 4,    # Number of channels*4 polarisations
+    'BLOCKDIMY': 8,      # Number of baselines
+    'BLOCKDIMZ': 32,     # Number of timesteps
+    'maxregs': 48       # Maximum number of registers
+}
+
+DOUBLE_PARAMS = {
+    'BLOCKDIMX': 4,    # Number of channels*4 polarisations
+    'BLOCKDIMY': 8,     # Number of baselines
+    'BLOCKDIMZ': 32,     # Number of timesteps
+    'maxregs': 63       # Maximum number of registers
+}
+
+KERNEL_TEMPLATE = string.Template("""
+#include <stdint.h>
+#include <cstdio>
+#include \"math_constants.h\"
+//#include <cub/block/block_reduce.cuh>
+#include <montblanc/include/abstraction.cuh>
+#include <montblanc/include/jones.cuh>
+
+#define BLOCKDIMX ${BLOCKDIMX}
+#define BLOCKDIMY ${BLOCKDIMY}
+#define BLOCKDIMZ ${BLOCKDIMZ}
+
+#define TWO_PI_OVER_C ${two_pi_over_c}
+
+template <typename T>
+class SumCohTraits {};
+
+template <>
+class SumCohTraits<float> {
+public:
+    typedef float3 UVWType;
+};
+
+template <>
+class SumCohTraits<double> {
+public:
+    typedef double3 UVWType;
+};
+
+// Here, the definition of the
+// rime_const_data struct
+// is inserted into the template
+// An area of constant memory
+// containing an instance of this
+// structure is declared. 
+${rime_const_data_struct}
+__constant__ rime_const_data C;
+#define LEXT(name) C.name.lower_extent
+#define UEXT(name) C.name.upper_extent
+#define DEXT(name) (C.name.upper_extent - C.name.lower_extent)
+ 
+#define NA (C.na.local_size)
+#define NBL (C.nbl.local_size)
+#define NCHAN (C.nchan.local_size)
+#define NTIME (C.ntime.local_size)
+#define NPSRC (C.npsrc.local_size)
+#define NGSRC (C.ngsrc.local_size)
+#define NSSRC (C.nssrc.local_size)
+#define NSRC (C.nnsrc.local_size)
+#define NPOL (C.npol.local_size)
+#define NPOLCHAN (C.npolchan.local_size)
+
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+#else
+__device__ double atomicAdd(double* address, double val)
+{
+    unsigned long long int* address_as_ull =
+                             (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+old = atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(val +
+                               __longlong_as_double(assumed)));
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
+#endif
+
+
+template <
+    typename T,
+    bool apply_weights=false,
+    typename Tr=montblanc::kernel_traits<T>,
+    typename Po=montblanc::kernel_policies<T> >
+__device__
+void sersic_chi_squared_gradient_impl(
+    typename SumCohTraits<T>::UVWType * uvw,
+    typename Tr::ft * sersic_shape,
+    typename Tr::ft * frequency,
+    int * antenna1,
+    int * antenna2,
+    typename Tr::ct * jones,
+    uint8_t * flag,
+    typename Tr::ft * weight_vector,
+    typename Tr::ct * observed_vis,
+    typename Tr::ct * G_term,
+    typename Tr::ct * visibilities,
+    typename Tr::ft * X2_grad)
+{
+    int POLCHAN = blockIdx.x*blockDim.x + threadIdx.x;
+    int BL = blockIdx.y*blockDim.y + threadIdx.y;
+    int TIME = blockIdx.z*blockDim.z + threadIdx.z;
+ 
+    // --- Specialize BlockReduce for type float. 
+    typedef cub::BlockReduce<T, BLOCKDIMX,cub::BLOCK_REDUCE_WARP_REDUCTIONS,BLOCKDIMY,BLOCKDIMZ> BlockReduceT; 
+
+    // --- Allocate temporary storage in shared memory 
+    __shared__ typename BlockReduceT::TempStorage temp_storage;
+
+    __shared__ struct {
+        typename SumCohTraits<T>::UVWType uvw[BLOCKDIMZ][BLOCKDIMY];
+
+        T e1;
+        T e2;
+        T sersic_scale;
+
+        T X2_grad_part[3];
+
+        T freq[BLOCKDIMX];
+
+    } shared;
+
+    int i, ANT1, ANT2;
+    typename Tr::ct dev_vis[3];
+    dev_vis[0].x = 0.; dev_vis[1].x = 0; dev_vis[2].x = 0;
+    dev_vis[0].y = 0.; dev_vis[1].y = 0; dev_vis[2].y = 0;
+    typename Tr::ct delta;
+    delta.x = 0.; delta.y = 0.;
+
+    if (TIME < DEXT(ntime) && BL < DEXT(nbl) && POLCHAN < DEXT(npolchan))
+    {
+
+      T & U = shared.uvw[threadIdx.z][threadIdx.y].x; 
+      T & V = shared.uvw[threadIdx.z][threadIdx.y].y; 
+      T & W = shared.uvw[threadIdx.z][threadIdx.y].z; 
+    
+      // Figure out the antenna pairs
+      i = TIME*NBL + BL;   
+      ANT1 = antenna1[i];
+      ANT2 = antenna2[i];
+  
+      // UVW coordinates vary by baseline and time, but not polarised channel
+      if(threadIdx.x == 0)
+      {
+        // UVW, calculated from u_pq = u_p - u_q
+        i = TIME*NA + ANT1;
+        shared.uvw[threadIdx.z][threadIdx.y] = uvw[i];
+
+        i = TIME*NA + ANT2;
+        typename SumCohTraits<T>::UVWType ant2_uvw = uvw[i];
+        U -= ant2_uvw.x;
+        V -= ant2_uvw.y;
+        W -= ant2_uvw.z;
+      }
+
+      // Wavelength varies by channel, but not baseline and time
+      // TODO uses 4 times the actually required space, since
+      // we don't need to store a frequency per polarisation
+      if(threadIdx.y == 0 && threadIdx.z == 0)
+      {        
+        if (NPOL >1)
+          { shared.freq[threadIdx.x] = frequency[POLCHAN >> 2]; }
+        else
+          { shared.freq[threadIdx.x] = frequency[POLCHAN]; }
+      }
+
+
+      i = (TIME*NBL +BL)* NPOLCHAN + POLCHAN;
+      delta = observed_vis[i];
+      delta.x -= visibilities[i].x;
+      delta.y -= visibilities[i].y;
+
+      // Zero the polarisation if it is flagged
+      if(flag[i] > 0)
+      {
+        delta.x = 0; delta.y = 0;
+      }
+
+      // Apply any necessary weighting factors
+      if(apply_weights)
+      {
+        T w = weight_vector[i];
+        delta.x *= w;
+        delta.y *= w;
+      }
+    }
+    int SRC_START = DEXT(npsrc) + DEXT(ngsrc);
+    int SRC_STOP = SRC_START + DEXT(nssrc);
+
+    // Loop over Sersic Sources
+    for(int SRC = SRC_START; SRC < SRC_STOP; ++SRC)
+    {
+
+      // sersic shape only varies by source. Shape parameters
+      // thus apply to the entire block and we can load them with
+      // only the first thread.
+      if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
+      {
+          i = SRC - SRC_START;                 shared.e1 = sersic_shape[i];
+          i += DEXT(nssrc);                    shared.e2 = sersic_shape[i];
+          i += DEXT(nssrc);                    shared.sersic_scale = sersic_shape[i];
+      }
+      __syncthreads();
+
+      if (TIME < DEXT(ntime) && BL < DEXT(nbl) && POLCHAN < DEXT(npolchan))
+      {
+
+        // Create references to a
+        // complicated part of shared memory
+        const T & U = shared.uvw[threadIdx.z][threadIdx.y].x; 
+        const T & V = shared.uvw[threadIdx.z][threadIdx.y].y; 
+
+        // sersic source in  the Fourier domain
+        T u1 = U*(T(1.0)+shared.e1) + V*shared.e2;
+        T v1 = U*shared.e2 + V*(T(1.0)-shared.e1);
+
+        T sersic_factor = T(1.0)-shared.e1*shared.e1-shared.e2*shared.e2;
+        T dev_e1 = u1*(U*sersic_factor + 2*shared.e1*u1);
+        dev_e1 += v1*(2*shared.e1*v1 - V*sersic_factor);
+        T dev_e2 = u1*(V*sersic_factor + 2*shared.e2*u1);
+        dev_e2 += v1*(U*sersic_factor + 2*shared.e2*v1);
+
+        // pay attention to the instructions order because temp and sersic_factor variables are utilised twice!!!
+        T temp = T(TWO_PI_OVER_C)*shared.freq[threadIdx.x]*shared.sersic_scale;
+        temp /= sersic_factor;
+        temp *= temp;
+        dev_e1 *= temp/sersic_factor;
+        dev_e2 *= temp/sersic_factor;     
+   
+        temp *= (u1*u1+v1*v1);
+        sersic_factor = T(1.0)+temp;
+        dev_e1 /= sersic_factor;
+        dev_e2 /= sersic_factor;
+
+        T dev_scale = temp/(shared.sersic_scale*sersic_factor);
+
+        // Get the complex scalars for antenna two and multiply
+        // in the exponent term
+        // Get the complex scalar for antenna one and conjugate it
+        i = ((SRC*NTIME + TIME)*NA + ANT1)*NPOLCHAN + POLCHAN;
+        typename Tr::ct ant_one = jones[i];
+        sersic_factor = T(1.0) / (sersic_factor*Po::sqrt(sersic_factor));
+        ant_one.x *= sersic_factor;
+        ant_one.y *= sersic_factor;
+        i = ((SRC*NTIME + TIME)*NA + ANT2)*NPOLCHAN + POLCHAN;
+        typename Tr::ct ant_two = jones[i];
+        if (NPOL > 1)
+          montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant_one, ant_two);
+        else
+          montblanc::complex_conjugate_multiply_in_place<T>(ant_one, ant_two);
+       
+
+        dev_vis[0].x = ant_one.x*dev_e1;
+        dev_vis[0].y = ant_one.y*dev_e1;
+        dev_vis[1].x = ant_one.x*dev_e2;
+        dev_vis[1].y = ant_one.y*dev_e2;
+        dev_vis[2].x = ant_one.x*dev_scale;
+        dev_vis[2].y = ant_one.y*dev_scale;
+  /* 
+        for (int p = 0; p < 3; p++)
+        {
+          // Multiply the visibility derivative by antenna 1's g term
+          i = (TIME*NA + ANT1)*NPOLCHAN + POLCHAN;
+          typename Tr::ct ant1_g_term = G_term[i];
+          montblanc::jones_multiply_4x4_in_place<T>(ant1_g_term, dev_vis[p]);
+
+          // Multiply the visibility by antenna 2's g term
+          i = (TIME*NA + ANT2)*NPOLCHAN + POLCHAN;
+          typename Tr::ct ant2_g_term = G_term[i];
+          montblanc::jones_multiply_4x4_hermitian_transpose_in_place<T>(ant1_g_term, ant2_g_term);    
+          dev_vis[p] = ant1_g_term;
+        }
+   */
+      }   
+      // Write partial derivative with respect to sersic parameters
+      for (int p=0; p<3; p++)
+      {
+        dev_vis[p].x *= delta.x;
+        dev_vis[p].y *= delta.y;
+            
+        dev_vis[p].x += dev_vis[p].y;
+        dev_vis[p].x *= 6;
+         
+        __syncthreads();
+
+         // block reduction, result available only in the first thread
+         T aggregate = BlockReduceT(temp_storage).Sum(dev_vis[p].x);
+          
+         if(threadIdx.z == 0 and threadIdx.y == 0 && threadIdx.x == 0)
+                 shared.X2_grad_part[p] = aggregate;
+      }
+        
+      __syncthreads();
+      //atomic addition to avoid concurrent access in the device memory (contribution for a single particle)
+      // 3 different threads writes each a different component to avoid serialisation
+      if (threadIdx.x == 0 && threadIdx.y < 3 && threadIdx.z == 0)
+      {
+          i = SRC - SRC_START + threadIdx.y*DEXT(nssrc);
+          atomicAdd(&(X2_grad[i]), shared.X2_grad_part[threadIdx.y]);
+      }
+    }
+  
+}
+
+extern "C" {
+
+// Macro that stamps out different kernels, depending
+// on whether we're handling floats or doubles
+// Arguments
+// - ft: The floating point type. Should be float/double.
+// - ct: The complex type. Should be float2/double2.
+// - apply_weights: boolean indicating whether we're weighting our visibilities
+
+#define stamp_sersic_chi_squared_gradient_fn(ft, ct, uvwt, apply_weights) \
+__global__ void \
+sersic_chi_squared_gradient( \
+    uvwt * uvw, \
+    ft * sersic_shape, \
+    ft * frequency, \
+    int * antenna1, \
+    int * antenna2, \
+    ct * jones, \
+    uint8_t * flag,\
+    ft * weight_vector, \
+    ct * observed_vis, \
+    ct * G_term, \
+    ct * visibilities, \
+    ft * X2_grad) \
+{ \
+    sersic_chi_squared_gradient_impl<ft, apply_weights>(uvw, sersic_shape, \
+        frequency, antenna1, antenna2, jones, flag,\
+        weight_vector, observed_vis, G_term, \
+        visibilities, X2_grad); \
+}
+
+${stamp_function}
+
+} // extern "C" {
+""")
+
+class SersicChiSquaredGradient(Node):
+    def __init__(self, weight_vector=False):
+        super(SersicChiSquaredGradient, self).__init__()
+        self.weight_vector = weight_vector
+
+    def initialise(self, solver, stream=None):
+        slvr = solver
+        ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan')
+
+          # Get a property dictionary off the solver
+        D = slvr.template_dict()
+        # Include our kernel parameters
+        D.update(FLOAT_PARAMS if slvr.is_float() else DOUBLE_PARAMS)
+        D['rime_const_data_struct'] = slvr.const_data().string_def()
+
+        D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'] = \
+            mbu.redistribute_threads(D['BLOCKDIMX'], D['BLOCKDIMY'], D['BLOCKDIMZ'],
+            npolchan, nbl, ntime)
+
+        regs = str(FLOAT_PARAMS['maxregs'] \
+            if slvr.is_float() else DOUBLE_PARAMS['maxregs'])
+
+        # Create the signature of the call to the function stamping macro
+        stamp_args = ', '.join([
+            'float' if slvr.is_float() else 'double',
+            'float2' if slvr.is_float() else 'double2',
+            'float3' if slvr.is_float() else 'double3',
+            'true' if slvr.use_weight_vector() else 'false'])
+        stamp_fn = ''.join(['stamp_sersic_chi_squared_gradient_fn(', stamp_args, ')'])
+        D['stamp_function'] = stamp_fn
+
+        kname = 'sersic_chi_squared_gradient'
+
+        self.mod = SourceModule(
+            KERNEL_TEMPLATE.substitute(**D),
+            options=['-lineinfo','-maxrregcount', regs],
+            include_dirs=[montblanc.get_source_path()],
+            no_extern_c=True)
+
+        self.rime_const_data = self.mod.get_global('C')
+        self.kernel = self.mod.get_function(kname)
+        self.launch_params = self.get_launch_params(slvr, D)
+
+    def shutdown(self, solver, stream=None):
+        pass
+
+    def pre_execution(self, solver, stream=None):
+        pass
+
+    def get_launch_params(self, slvr, D):
+        polchans_per_block = D['BLOCKDIMX']
+        bl_per_block = D['BLOCKDIMY']
+        times_per_block = D['BLOCKDIMZ']
+
+        ntime, nbl, npolchan = slvr.dim_local_size('ntime', 'nbl', 'npolchan')
+        polchan_blocks = mbu.blocks_required(npolchan, polchans_per_block)
+        bl_blocks = mbu.blocks_required(nbl, bl_per_block)
+        time_blocks = mbu.blocks_required(ntime, times_per_block)
+
+        return {
+            'block' : (polchans_per_block, bl_per_block, times_per_block),
+            'grid'  : (polchan_blocks, bl_blocks, time_blocks),
+        }
+
+    def execute(self, solver, stream=None):
+        slvr = solver
+
+        if stream is not None:
+            cuda.memcpy_htod_async(
+                self.rime_const_data[0],
+                slvr.const_data().ndary(),
+                stream=stream)
+        else:
+            cuda.memcpy_htod(
+                self.rime_const_data[0],
+                slvr.const_data().ndary())
+
+        sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \
+            else slvr.sersic_shape
+
+        nssrc = slvr.dim_local_size('nssrc') 
+        if slvr.is_float():
+            self.gradient = gpuarray.zeros((3,nssrc,),dtype=np.float32)
+        else:
+            self.gradient = gpuarray.zeros((3,nssrc,),dtype=np.float64)
+
+        self.kernel(slvr.uvw, sersic,
+            slvr.frequency, slvr.antenna1, slvr.antenna2,
+            slvr.jones, slvr.flag, slvr.weight_vector,
+            slvr.observed_vis, slvr.G_term,
+            slvr.model_vis, self.gradient,
+            stream=stream, **self.launch_params)
+
+        slvr.X2_grad = self.gradient.get() 
+
+        if not self.weight_vector:
+            slvr.X2_grad = slvr.X2_grad/slvr.sigma_sqrd
+
+    def post_execution(self, solver, stream=None):
+        pass
diff --git a/montblanc/impl/rime/v4/loaders/loaders.py b/montblanc/impl/rime/v4/loaders/loaders.py
index eaabc062e..63b1c933f 100644
--- a/montblanc/impl/rime/v4/loaders/loaders.py
+++ b/montblanc/impl/rime/v4/loaders/loaders.py
@@ -250,4 +250,4 @@ def __enter__(solver):
         return super(MeasurementSetLoader,solver).__enter__()
 
     def __exit__(solver, type, value, traceback):
-        return super(MeasurementSetLoader,solver).__exit__(type,value,traceback)
\ No newline at end of file
+        return super(MeasurementSetLoader,solver).__exit__(type,value,traceback)
diff --git a/montblanc/impl/rime/v5/CompositeRimeSolver.py b/montblanc/impl/rime/v5/CompositeRimeSolver.py
index f717dd247..47d9b98ee 100644
--- a/montblanc/impl/rime/v5/CompositeRimeSolver.py
+++ b/montblanc/impl/rime/v5/CompositeRimeSolver.py
@@ -133,7 +133,9 @@ def __init__(self, slvr_cfg):
         # before throttling is applied
         self.throttle_factor = slvr_cfg.get(
             Options.VISIBILITY_THROTTLE_FACTOR)
-
+        # Enabling chi squared gradient computation
+        self.enable_sersic_grad = slvr_cfg.get(Options.SERSIC_GRADIENT)
+     
         # Massage the contexts for each device into a list
         if not isinstance(self.dev_ctxs, list):
             self.dev_ctxs = [self.dev_ctxs]
@@ -179,6 +181,7 @@ def __init__(self, slvr_cfg):
                     key=lambda T: T[1])
 
         P, M, mem = budgets[0]
+        #P[Options.NSRC] = slvr_cfg.get(Options.SOURCE_BATCH_SIZE)
 
         # Log some information about the memory budget
         # and dimension reduction
@@ -304,6 +307,9 @@ def _gen_vis_slices(self):
         cpu_slice['nvis'] = slice(0, 0, 1)
         gpu_slice['nvis'] = slice(0, 0, 1)
 
+        cpu_slice['npol'] = slice(0, npol, 1)
+        gpu_slice['npol'] = slice(0, npol, 1)
+
         montblanc.log.debug('Generating RIME slices')
 
         # Set up time slicing
@@ -864,11 +870,12 @@ def _thread_prime_memory_pools(self):
 
         # Now force return of memory to the pools
         for key, array_list in pinned_pool_refs.iteritems():
-            [a.base.free() for a in array_list]
+            [a.base.free() for a in array_list]      
 
         for array_list in device_pool_refs.itervalues():
             [a.free() for a in array_list]
 
+
     def _thread_start_solving(self):
         """
         Contains enqueue thread functionality that needs to
@@ -879,6 +886,7 @@ def _thread_start_solving(self):
         # transferred at least once. e.g. the beam cube
         [d.clear() for d in self.thread_local.dirty]
 
+
     def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
         """
         Enqueue CUDA memory transfer and kernel execution operations on a CUDA stream.
@@ -970,6 +978,45 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
                 _update_refs(pool_refs, {'cdata_coherencies' : [cdata_ref]})
                 kernel.execute(subslvr, subslvr.stream)
 
+            # Iterate again for gradient computation requiring visibilities computed
+            # for all source chuncks
+            if self.enable_sersic_grad:
+
+                # Get pinned memory to hold the full X2 gradient
+                nssrc = self.dim_local_size('nssrc')
+                sub_X2_grad = subslvr.pinned_mem_pool.allocate(
+                    shape=(3,nssrc), dtype=subslvr.X2_grad.dtype)
+
+                for src_cpu_slice_map, src_gpu_slice_map in self._gen_source_slices():
+                    # Update our maps with source slice information
+                    cpu_slice_map.update(src_cpu_slice_map)
+                    gpu_slice_map.update(src_gpu_slice_map)
+           
+                    # Configure dimension extents and global size on the sub-solver
+                    for name, slice_ in cpu_slice_map.iteritems():
+                        subslvr.update_dimension(name=name,
+                            global_size=self.dimension(name).global_size,
+                            lower_extent=slice_.start,
+                            upper_extent=slice_.stop)
+
+                    subslvr.X2_grad.fill(0, stream=subslvr.stream)                   
+      
+                    # Enqueue Sersic gradient
+                    kernel = subslvr.rime_sersic_gradient
+                    new_refs = self._enqueue_array(
+                        subslvr, cpu_slice_map, gpu_slice_map,
+                        direction=ASYNC_HTOD, dirty=dirty,
+                        classifiers=[Classifier.COHERENCIES_INPUT])
+                    cdata_ref = self._enqueue_const_data_htod(
+                        subslvr, kernel.rime_const_data[0])
+                    _update_refs(pool_refs, new_refs)
+                    _update_refs(pool_refs, {'cdata_gradient' : [cdata_ref]})
+                    kernel.execute(subslvr, subslvr.stream)
+
+                    # Enqueue the X2 gradient slice opy off the GPU onto the CPU
+                    src_range = src_cpu_slice_map['nssrc']
+                    subslvr.X2_grad.get_async(ary=sub_X2_grad[:,src_range], stream=subslvr.stream)
+
             # Enqueue chi-squared term reduction and return the
             # GPU array allocated to it
             X2_gpu_ary = subslvr.rime_reduce.execute(subslvr, subslvr.stream)
@@ -982,7 +1029,7 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
             X2_gpu_ary.get_async(ary=sub_X2, stream=subslvr.stream)
 
             # Enqueue transfer of simulator output (model visibilities) to the CPU
-            sim_output_refs = self._enqueue_array(subslvr,
+            sim_output_refs = self._enqueue_array(subslvr, 
                 cpu_slice_map, gpu_slice_map,
                 direction=ASYNC_DTOH, dirty={},
                 classifiers=[Classifier.SIMULATOR_OUTPUT])
@@ -1005,11 +1052,17 @@ def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map, **kwargs):
         pool_refs['X2_gpu'].append(X2_gpu_ary)
         pool_refs['X2_cpu'].append(sub_X2)
         pool_refs['model_vis_output'].append(model_vis)
-
-        return (sync_event, sub_X2, model_vis,
-            pool_refs, subslvr.pool_lock,
-            cpu_slice_map.copy(),
-            gpu_slice_map.copy())
+        if self.enable_sersic_grad:
+            pool_refs['X2_grad'].append(sub_X2_grad)
+            return (sync_event, sub_X2, model_vis, sub_X2_grad,
+                pool_refs, subslvr.pool_lock,
+                cpu_slice_map.copy(),
+                gpu_slice_map.copy())
+        else:
+            return (sync_event, sub_X2, model_vis,
+                pool_refs, subslvr.pool_lock,
+                cpu_slice_map.copy(),
+                gpu_slice_map.copy())
 
     def initialise(self):
         """ Initialise the sub-solver """
@@ -1062,8 +1115,12 @@ def _sync_wait(future):
             Return a copy of the pinned chi-squared, pinned model visibilities
             and index into the CPU array after synchronizing on the cuda_event
             """
-            cuda_event, pinned_X2, pinned_model_vis, \
-                pool_refs, pool_lock, cpu, gpu = future.result()
+            if self.enable_sersic_grad:
+                cuda_event, pinned_X2, pinned_model_vis, pinned_X2_grad,\
+                    pool_refs, pool_lock, cpu, gpu = future.result()
+            else:
+                cuda_event, pinned_X2, pinned_model_vis, \
+                    pool_refs, pool_lock, cpu, gpu = future.result()
 
             try:
                 cuda_event.synchronize()
@@ -1090,10 +1147,15 @@ def _sync_wait(future):
 
             X2 = pinned_X2.copy()
             model_vis = pinned_model_vis.copy().reshape(model_vis_shape)
+            if self.enable_sersic_grad:
+                X2_grad = pinned_X2_grad.copy()
 
             _free_pool_allocs(pool_refs, pool_lock)
 
-            return X2, model_vis, model_vis_idx
+            if self.enable_sersic_grad:
+                return X2, model_vis, model_vis_idx, X2_grad
+            else:
+                return X2, model_vis, model_vis_idx
 
         # For easier typing
         C = CompositeRimeSolver
@@ -1111,6 +1173,9 @@ def _sync_wait(future):
 
         # Running sum of the chi-squared values returned in futures
         X2_sum = self.ft(0.0)
+        if self.enable_sersic_grad:
+            nssrc = self.dim_local_size('nssrc')
+            self.X2_grad = self.ft(np.zeros((3,nssrc)))
 
         # Sets of return value futures for each executor
         value_futures = [set() for ex in self.enqueue_executors]
@@ -1181,8 +1246,12 @@ def _accumulate(model_slice, model_chunk):
                         for s in value_futures:
                             s.discard(f)
 
-                        # Get chi-squared and model visibilities
-                        X2, pinned_model_vis, model_vis_idx = f.result()
+                        # Get chi-squared and model visibilities (and X2 gradient)
+                        if self.enable_sersic_grad:
+                            X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result()
+                            self.X2_grad += pinned_X2_grad
+                        else:
+                            X2, pinned_model_vis, model_vis_idx = f.result()
                         # Sum X2
                         X2_sum += X2
                         # Write model visibilities to the numpy array
@@ -1200,8 +1269,12 @@ def _accumulate(model_slice, model_chunk):
 
         # Sum remaining chi-squared
         for f in done:
-            # Get chi-squared and model visibilities
-            X2, pinned_model_vis, model_vis_idx = f.result()
+            # Get chi-squared and model visibilities (and X2 gradient)
+            if self.enable_sersic_grad:
+                X2, pinned_model_vis, model_vis_idx, pinned_X2_grad = f.result()
+                self.X2_grad += pinned_X2_grad
+            else:
+                X2, pinned_model_vis, model_vis_idx = f.result()
             # Sum X2
             X2_sum += X2
             # Write model visibilities to the numpy array
@@ -1213,6 +1286,8 @@ def _accumulate(model_slice, model_chunk):
             self.set_X2(X2_sum)
         else:
             self.set_X2(X2_sum/self.sigma_sqrd)
+            if self.enable_sersic_grad:
+                self.X2_grad = self.X2_grad/self.sigma_sqrd
 
     def shutdown(self):
         """ Shutdown the solver """
diff --git a/montblanc/impl/rime/v5/RimeSolver.py b/montblanc/impl/rime/v5/RimeSolver.py
index 7d7192ee0..32b41d7cf 100644
--- a/montblanc/impl/rime/v5/RimeSolver.py
+++ b/montblanc/impl/rime/v5/RimeSolver.py
@@ -31,6 +31,7 @@
 from montblanc.impl.rime.v5.gpu.RimeEKBSqrt import RimeEKBSqrt
 from montblanc.impl.rime.v5.gpu.RimeSumCoherencies import RimeSumCoherencies
 from montblanc.impl.rime.v5.gpu.RimeReduction import RimeReduction
+from montblanc.impl.rime.v5.gpu.SersicChiSquaredGradient import SersicChiSquaredGradient
 
 from montblanc.impl.rime.v4.RimeSolver import RimeSolver as RimeSolverV4
 
@@ -64,11 +65,14 @@ def __init__(self, slvr_cfg):
             slvr_cfg[Options.E_BEAM_DEPTH],
             description='E cube nu depth')
 
+        self.enable_sersic_grad = slvr_cfg.get(Options.SERSIC_GRADIENT)
+
         self.rime_e_beam = RimeEBeam()
         self.rime_b_sqrt = RimeBSqrt()
         self.rime_ekb_sqrt = RimeEKBSqrt()
         self.rime_sum = RimeSumCoherencies()
         self.rime_reduce = RimeReduction()
+        self.rime_sersic_gradient = SersicChiSquaredGradient()
 
         from montblanc.impl.rime.v4.ant_pairs import monkey_patch_antenna_pairs
         monkey_patch_antenna_pairs(self)
@@ -136,6 +140,8 @@ def initialise(self):
             self.rime_ekb_sqrt.initialise(self)
             self.rime_sum.initialise(self)
             self.rime_reduce.initialise(self)
+            if self.enable_sersic_grad:
+                self.rime_sersic_gradient.initialise(self)
 
     def solve(self):
         with self.context:
@@ -144,6 +150,8 @@ def solve(self):
             self.rime_ekb_sqrt.execute(self)
             self.rime_sum.execute(self)
             self.rime_reduce.execute(self)
+            if self.enable_sersic_grad:
+                self.rime_sersic_gradient.execute(self)
 
     def shutdown(self):
         with self.context:
@@ -151,4 +159,6 @@ def shutdown(self):
             self.rime_b_sqrt.shutdown(self)
             self.rime_ekb_sqrt.shutdown(self)
             self.rime_sum.shutdown(self)
-            self.rime_reduce.shutdown(self)
\ No newline at end of file
+            self.rime_reduce.shutdown(self)
+            if self.enable_sersic_grad:
+                self.rime_sersic_gradient.shutdown(self)
diff --git a/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py b/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py
new file mode 100644
index 000000000..70d5c0445
--- /dev/null
+++ b/montblanc/impl/rime/v5/gpu/SersicChiSquaredGradient.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2016 Marzia Rivi
+#
+# This file is part of montblanc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+import numpy as np
+
+import pycuda.driver as cuda
+import pycuda.gpuarray as gpuarray
+
+import montblanc.impl.rime.v4.gpu.SersicChiSquaredGradient
+
+class SersicChiSquaredGradient(montblanc.impl.rime.v4.gpu.SersicChiSquaredGradient.SersicChiSquaredGradient):
+    def __init__(self):
+        super(SersicChiSquaredGradient, self).__init__()
+    def initialise(self, solver, stream=None):
+        super(SersicChiSquaredGradient, self).initialise(solver,stream)
+    def shutdown(self, solver, stream=None):
+        super(SersicChiSquaredGradient, self).shutdown(solver,stream)
+    def pre_execution(self, solver, stream=None):
+        super(SersicChiSquaredGradient, self).pre_execution(solver,stream)
+
+        if stream is not None:
+            cuda.memcpy_htod_async(
+                self.rime_const_data[0],
+                solver.const_data().ndary(),
+                stream=stream)
+        else:
+            cuda.memcpy_htod(
+                self.rime_const_data[0],
+                solver.const_data().ndary())
+
+    def post_execution(self, solver, stream=None):
+        super(SersicChiSquaredGradient, self).post_execution(solver,stream)
+
+    def execute(self, solver, stream=None):
+        slvr = solver
+
+        sersic = np.intp(0) if np.product(slvr.sersic_shape.shape) == 0 \
+            else slvr.sersic_shape
+
+        self.kernel(slvr.uvw, sersic,
+            slvr.frequency, slvr.antenna1, slvr.antenna2,
+            slvr.jones, slvr.flag, slvr.weight_vector,
+            slvr.observed_vis, slvr.G_term,
+            slvr.model_vis, slvr.X2_grad,
+            stream=stream, **self.launch_params)
diff --git a/montblanc/impl/rime/v5/loaders/loaders.py b/montblanc/impl/rime/v5/loaders/loaders.py
index a35d25848..28545a661 100644
--- a/montblanc/impl/rime/v5/loaders/loaders.py
+++ b/montblanc/impl/rime/v5/loaders/loaders.py
@@ -335,4 +335,4 @@ def __enter__(solver):
         return super(MeasurementSetLoader,solver).__enter__()
 
     def __exit__(solver, type, value, traceback):
-        return super(MeasurementSetLoader,solver).__exit__(type,value,traceback)
\ No newline at end of file
+        return super(MeasurementSetLoader,solver).__exit__(type,value,traceback)