diff --git a/doc/source/history.rst b/doc/source/history.rst
index 25336b0..be1ab18 100644
--- a/doc/source/history.rst
+++ b/doc/source/history.rst
@@ -6,8 +6,12 @@ Release history
 0.8.0 (current development version)
 ===================================
 
+* CHANGED: the ``dest_offset`` parameter of ``copy_array()`` was renamed to ``dst_offset`` to match the ``pyopencl``/``pycuda`` API.
+
 * FIXED: register ``numpy.bool`` in addition to ``bool`` - it is a separate type now (@perdigao1).
 
+* FIXED: incompatibility with modern versions of ``pyopencl``.
+
 
 0.7.6 (20 Nov 2021)
 ===================
diff --git a/reikna/cluda/api.py b/reikna/cluda/api.py
index 6b2bd9f..1ed8dbe 100644
--- a/reikna/cluda/api.py
+++ b/reikna/cluda/api.py
@@ -461,13 +461,13 @@ def from_device(self, arr, dest=None, async_=False):
         """
         raise NotImplementedError()
 
-    def copy_array(self, arr, dest=None, src_offset=0, dest_offset=0, size=None):
+    def copy_array(self, arr, dest=None, src_offset=0, dst_offset=0, size=None):
         """
         Copies array on device.
 
         :param dest: the effect is the same as in :py:meth:`to_device`.
         :param src_offset: offset (in items of ``arr.dtype``) in the source array.
-        :param dest_offset: offset (in items of ``arr.dtype``) in the destination array.
+        :param dst_offset: offset (in items of ``arr.dtype``) in the destination array.
         :param size: how many elements of ``arr.dtype`` to copy.
         """
 
@@ -479,10 +479,10 @@ def copy_array(self, arr, dest=None, src_offset=0, dest_offset=0, size=None):
         itemsize = arr.dtype.itemsize
         nbytes = arr.nbytes if size is None else itemsize * size
         src_offset *= itemsize
-        dest_offset *= itemsize
+        dst_offset *= itemsize
 
         self._copy_array_buffer(arr_device, arr,
-            nbytes, src_offset=src_offset, dest_offset=dest_offset)
+            nbytes, src_offset=src_offset, dst_offset=dst_offset)
         self._synchronize()
 
         if dest is None:
diff --git a/reikna/cluda/cuda.py b/reikna/cluda/cuda.py
index b24ee66..86242ae 100644
--- a/reikna/cluda/cuda.py
+++ b/reikna/cluda/cuda.py
@@ -204,13 +204,13 @@ def from_device(self, arr, dest=None, async_=False):
         if dest is None:
             return arr_cpu
 
-    def _copy_array_buffer(self, dest, src, nbytes, src_offset=0, dest_offset=0):
+    def _copy_array_buffer(self, dest, src, nbytes, src_offset=0, dst_offset=0):
         self._memcpy_dtod(
-            dest.gpudata, src.gpudata, nbytes, src_offset=src_offset, dest_offset=dest_offset)
+            dest.gpudata, src.gpudata, nbytes, src_offset=src_offset, dst_offset=dst_offset)
 
-    def _memcpy_dtod(self, dest, src, nbytes, src_offset=0, dest_offset=0):
+    def _memcpy_dtod(self, dest, src, nbytes, src_offset=0, dst_offset=0):
         cuda.memcpy_dtod_async(
-            int(dest) + dest_offset,
+            int(dest) + dst_offset,
             int(src) + src_offset,
             nbytes, stream=self._queue)
 
diff --git a/reikna/cluda/ocl.py b/reikna/cluda/ocl.py
index 7a85444..eb652af 100644
--- a/reikna/cluda/ocl.py
+++ b/reikna/cluda/ocl.py
@@ -27,10 +27,16 @@ class Array(clarray.Array):
     """
     def __init__(
             self, thr, shape, dtype, strides=None, offset=0, nbytes=None,
-            allocator=None, base_data=None):
+            allocator=None, data=None, events=None,
+            _fast=False, _context=None, _queue=None, _size=None):
+
+        if thr is None:
+            thr = Thread(_queue)
+
         clarray.Array.__init__(
             self, thr._queue, shape, dtype, strides=strides, allocator=allocator,
-            data=base_data, offset=offset)
+            data=data, offset=offset, events=events,
+            _fast=_fast, _context=thr._context, _queue=thr._queue, _size=_size)
         self.nbytes = nbytes
         self.thread = thr
 
@@ -51,7 +57,7 @@ def __getitem__(self, index):
         # Let cl.Array calculate the new strides and offset
         return self.thread.array(
             shape=res.shape, dtype=res.dtype, strides=res.strides,
-            base_data=res.base_data,
+            base=res.base_data,
             offset=res.offset)
 
     def __setitem__(self, index, value):
@@ -88,7 +94,7 @@ def _process_cqd(self, cqd):
 
     def array(
             self, shape, dtype, strides=None, offset=0, nbytes=None,
-            allocator=None, base=None, base_data=None):
+            allocator=None, base=None, data=None):
 
         if allocator is None:
             allocator = self.allocate
@@ -98,14 +104,16 @@ def array(
         if nbytes is None:
             nbytes = int(min_buffer_size(shape, dtype.itemsize, strides=strides, offset=offset))
 
-        if (offset != 0 or strides is not None) and base_data is None and base is None:
-            base_data = allocator(nbytes)
+        if (offset != 0 or strides is not None) and data is None and base is None:
+            data = allocator(nbytes)
         elif base is not None:
-            base_data = base.data
+            if isinstance(base, Array):
+                base = base.base_data
+            data = base
 
         return Array(
             self, shape, dtype, strides=strides, offset=offset,
-            allocator=allocator, base_data=base_data, nbytes=nbytes)
+            allocator=allocator, data=data, nbytes=nbytes)
 
     def allocate(self, size):
         return cl.Buffer(self._context, cl.mem_flags.READ_WRITE, size=size)
@@ -118,10 +126,10 @@ def from_device(self, arr, dest=None, async_=False):
         if dest is None:
             return arr_cpu
 
-    def _copy_array_buffer(self, dest, src, nbytes, src_offset=0, dest_offset=0):
+    def _copy_array_buffer(self, dest, src, nbytes, src_offset=0, dst_offset=0):
         cl.enqueue_copy(
             self._queue, dest.data, src.data,
-            byte_count=nbytes, src_offset=src_offset, dest_offset=dest_offset)
+            byte_count=nbytes, src_offset=src_offset, dst_offset=dst_offset)
 
     def synchronize(self):
         self._queue.finish()
@@ -222,6 +230,6 @@ def prepare(self, global_size, local_size=None, local_mem=0):
         self._global_size = wrap_in_tuple(global_size)
 
     def _prepared_call(self, *args):
-        # Passing base_data, assuming that the kernel knows how to handle the offset and the strides
+        # Passing the underlying buffer (base_data), assuming that the kernel knows how to handle the offset and the strides
         args = [x.base_data if isinstance(x, clarray.Array) else x for x in args]
         return self._kernel(self._thr._queue, self._global_size, self._local_size, *args)
diff --git a/test/test_cluda/test_basics.py b/test/test_cluda/test_basics.py
index 53e15bc..0a98731 100644
--- a/test/test_cluda/test_basics.py
+++ b/test/test_cluda/test_basics.py
@@ -376,7 +376,7 @@ def test_offsets_in_kernel(thr):
     """
 
     global_size = 100
-    dest_offset = 4
+    dst_offset = 4
     src_offset = 2
     dtype = dtypes.normalize_type(numpy.int32)
 
@@ -384,15 +384,15 @@ def test_offsets_in_kernel(thr):
         KERNEL void test(GLOBAL_MEM int *dest, GLOBAL_MEM int *src)
         {
             const SIZE_T i = get_global_id(0);
-            dest[i + ${dest_offset}] = src[i + ${src_offset}];
+            dest[i + ${dst_offset}] = src[i + ${src_offset}];
         }
         """,
-        render_kwds=dict(dest_offset=dest_offset, src_offset=src_offset))
+        render_kwds=dict(dst_offset=dst_offset, src_offset=src_offset))
     test = program.test
 
-    dest_dev_base = thr.array(global_size + dest_offset, dtype)
+    dest_dev_base = thr.array(global_size + dst_offset, dtype)
     dest_dev = thr.array(
-        global_size, dtype, offset=dest_offset * dtype.itemsize, base=dest_dev_base)
+        global_size, dtype, offset=dst_offset * dtype.itemsize, base=dest_dev_base)
 
     src_base = numpy.arange(global_size + src_offset).astype(dtype)
     src_dev_base = thr.to_device(src_base)
diff --git a/test/test_core/test_transformation.py b/test/test_core/test_transformation.py
index f024b82..6defb83 100644
--- a/test/test_core/test_transformation.py
+++ b/test/test_core/test_transformation.py
@@ -560,7 +560,7 @@ def test_array_offset(thr):
     # providing base_data
     a3_base = thr.array((arr_len + offset_len,), dtype)
     a3_data = a3_base.base_data
-    a3 = thr.array((arr_len,), dtype, offset=itemsize * offset_len, base_data=a3_data)
+    a3 = thr.array((arr_len,), dtype, offset=itemsize * offset_len, data=a3_data)
 
     fill = PureParallel(
         [