diff --git a/internal/tdp/thunks/repeated.go b/internal/tdp/thunks/repeated.go
index c8b08aa..ae6a600 100644
--- a/internal/tdp/thunks/repeated.go
+++ b/internal/tdp/thunks/repeated.go
@@ -294,7 +294,7 @@ func parsePackedVarint[T tdp.Int](p1 vm.P1, p2 vm.P2) (vm.P1, vm.P2) {
 	e8 := p.Add(layout.RoundDown(int(e-p), 8))
 	if p < e8 {
 	again:
-		bytes := *xunsafe.Cast[uint64](p.AssertValid())
+		bytes := xunsafe.ByteLoad[uint64](p.AssertValid(), 0)
 		count += bits.OnesCount64(bytes & tdp.SignBits)
 		p = p.Add(8)
 		if p < e8 {
@@ -303,7 +303,7 @@ func parsePackedVarint[T tdp.Int](p1 vm.P1, p2 vm.P2) (vm.P1, vm.P2) {
 	}
 	if p < e {
 		left := int(e - p)
-		bytes := *xunsafe.Cast[uint64](p.AssertValid())
+		bytes := xunsafe.ByteLoad[uint64](p.AssertValid(), 0)
 		count += bits.OnesCount64(bytes & (tdp.SignBits >> uint((8-left)*8)))
 	}
 }
diff --git a/internal/tdp/thunks/singular.go b/internal/tdp/thunks/singular.go
index c59e844..7be4876 100644
--- a/internal/tdp/thunks/singular.go
+++ b/internal/tdp/thunks/singular.go
@@ -294,7 +294,7 @@ func parseFixed[T tdp.Int](p1 vm.P1, p2 vm.P2) (vm.P1, vm.P2) {
 	}
 	var p *T
 	p1, p2, p = vm.GetMutableField[T](p1, p2)
-	*p = *xunsafe.Cast[T](p1.PtrAddr.AssertValid())
+	*p = xunsafe.ByteLoad[T](p1.PtrAddr.AssertValid(), 0)
 	p1 = p1.Advance(layout.Size[T]())
 
 	return p1, p2
diff --git a/internal/tdp/thunks/stencils.go b/internal/tdp/thunks/stencils.go
index 91ca73b..5195376 100644
--- a/internal/tdp/thunks/stencils.go
+++ b/internal/tdp/thunks/stencils.go
@@ -8354,7 +8354,7 @@ func parsePackedVarint8(p1 vm.P1, p2 vm.P2) (vm.P1, vm.P2) {
 	e8 := p.Add(layout.RoundDown(int(e-p), 8))
 	if p < e8 {
 	again:
-		bytes := *xunsafe.Cast[uint64](p.AssertValid())
+		bytes := xunsafe.ByteLoad[uint64](p.AssertValid(), 0)
 		count += bits.OnesCount64(bytes & tdp.SignBits)
 		p = p.Add(8)
 		if p < e8 {
@@ -8363,7 +8363,7 @@ func parsePackedVarint8(p1 vm.P1, p2 vm.P2) (vm.P1, vm.P2) {
 	}
 	if p < e {
 		left := int(e - p)
-		bytes := *xunsafe.Cast[uint64](p.AssertValid())
+		bytes := xunsafe.ByteLoad[uint64](p.AssertValid(), 0)
 		count += bits.OnesCount64(bytes & (tdp.SignBits >> uint((8-left)*8)))
 	}
 }
@@ -8483,7 +8483,7 @@ func parsePackedVarint32(p1 vm.P1, p2 vm.P2) (vm.P1, vm.P2) {
 	e8 := p.Add(layout.RoundDown(int(e-p), 8))
 	if p < e8 {
 	again:
-		bytes := *xunsafe.Cast[uint64](p.AssertValid())
+		bytes := xunsafe.ByteLoad[uint64](p.AssertValid(), 0)
 		count += bits.OnesCount64(bytes & tdp.SignBits)
 		p = p.Add(8)
 		if p < e8 {
@@ -8492,7 +8492,7 @@ func parsePackedVarint32(p1 vm.P1, p2 vm.P2) (vm.P1, vm.P2) {
 	}
 	if p < e {
 		left := int(e - p)
-		bytes := *xunsafe.Cast[uint64](p.AssertValid())
+		bytes := xunsafe.ByteLoad[uint64](p.AssertValid(), 0)
 		count += bits.OnesCount64(bytes & (tdp.SignBits >> uint((8-left)*8)))
 	}
 }
@@ -8612,7 +8612,7 @@ func parsePackedVarint64(p1 vm.P1, p2 vm.P2) (vm.P1, vm.P2) {
 	e8 := p.Add(layout.RoundDown(int(e-p), 8))
 	if p < e8 {
 	again:
-		bytes := *xunsafe.Cast[uint64](p.AssertValid())
+		bytes := xunsafe.ByteLoad[uint64](p.AssertValid(), 0)
 		count += bits.OnesCount64(bytes & tdp.SignBits)
 		p = p.Add(8)
 		if p < e8 {
@@ -8621,7 +8621,7 @@ func parsePackedVarint64(p1 vm.P1, p2 vm.P2) (vm.P1, vm.P2) {
 	}
 	if p < e {
 		left := int(e - p)
-		bytes := *xunsafe.Cast[uint64](p.AssertValid())
+		bytes := xunsafe.ByteLoad[uint64](p.AssertValid(), 0)
 		count += bits.OnesCount64(bytes & (tdp.SignBits >> uint((8-left)*8)))
 	}
 }
@@ -8945,7 +8945,7 @@ func parseFixed32(p1 vm.P1, p2 vm.P2) (vm.P1, vm.P2) {
 	}
 	var p *uint32
 	p1, p2, p = vm.GetMutableField[uint32](p1, p2)
-	*p = *xunsafe.Cast[uint32](p1.PtrAddr.AssertValid())
+	*p = xunsafe.ByteLoad[uint32](p1.PtrAddr.AssertValid(), 0)
 	p1 = p1.Advance(layout.Size[uint32]())
 
 	return p1, p2
@@ -8959,7 +8959,7 @@ func parseFixed64(p1 vm.P1, p2 vm.P2) (vm.P1, vm.P2) {
 	}
 	var p *uint64
 	p1, p2, p = vm.GetMutableField[uint64](p1, p2)
-	*p = *xunsafe.Cast[uint64](p1.PtrAddr.AssertValid())
+	*p = xunsafe.ByteLoad[uint64](p1.PtrAddr.AssertValid(), 0)
 	p1 = p1.Advance(layout.Size[uint64]())
 
 	return p1, p2
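
Note on the xunsafe.Cast to xunsafe.ByteLoad swap above: *xunsafe.Cast[uint64](p) materializes a typed *uint64 and dereferences it, which is exactly the kind of load Go's -d=checkptr instrumentation inspects, and it can fire when the wide load is misaligned or grazes the end of an allocation. Routing the load through a byte-oriented helper avoids forming the typed pointer. The sketch below is one plausible shape for such a helper, assuming AssertValid() yields an unsafe.Pointer; the signature matches the call sites in this diff, but the body is an assumption, not the actual xunsafe implementation.

package sketch

import "unsafe"

// byteLoad reads a T at p+offset by copying bytes into a local instead
// of dereferencing *(*T)(p), so no typed, possibly misaligned pointer
// is ever formed. Hypothetical stand-in for xunsafe.ByteLoad.
func byteLoad[T any](p unsafe.Pointer, offset int) T {
	var v T
	size := unsafe.Sizeof(v)
	dst := unsafe.Slice((*byte)(unsafe.Pointer(&v)), size)
	src := unsafe.Slice((*byte)(unsafe.Add(p, offset)), size)
	copy(dst, src) // lowers to a single unaligned load on most targets
	return v
}

Whether the real helper copies bytes or hides the cast behind some other checkptr-friendly boundary, the call sites above read the same either way.
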
diff --git a/internal/tdp/vm/run.go b/internal/tdp/vm/run.go
index 20ad81c..18552c5 100644
--- a/internal/tdp/vm/run.go
+++ b/internal/tdp/vm/run.go
@@ -458,6 +458,7 @@ func skipRecord(p1 P1, p2 P2, depth int) (P1, P2) {
 
 	switch ty {
 	case protowire.VarintType:
+		p1, p2 = p1.AtLeast(p2, 1)
 		p1, p2, _ = p1.Varint(p2)
 	case protowire.BytesType:
 		p1, p2, _ = p1.Bytes(p2)
diff --git a/internal/tdp/vm/utf8.go b/internal/tdp/vm/utf8.go
index 9ea9c67..f2feb97 100644
--- a/internal/tdp/vm/utf8.go
+++ b/internal/tdp/vm/utf8.go
@@ -43,7 +43,7 @@ func verifyUTF8(p1 P1, p2 P2, n int) (P1, P2, zc.Range) {
 	// and a remainder part that only does 0 to 7 bytes.
 	if e8 > p {
 	again:
-		bytes := *xunsafe.Cast[uint64](p.AssertValid())
+		bytes := xunsafe.ByteLoad[uint64](p.AssertValid(), 0)
 		p = p.Add(8)
 		if bytes&tdp.SignBits != 0 {
 			p = p.Add(-8) // Back up, need to take the slow path.
@@ -56,7 +56,8 @@ func verifyUTF8(p1 P1, p2 P2, n int) (P1, P2, zc.Range) {
 	if e > p {
 		// Fast path for if the last few bytes are also ASCII.
 		left := int(e - p)
-		bytes := *xunsafe.Cast[uint64](p.AssertValid())
+		bytes := xunsafe.ByteLoad[uint64](p.AssertValid(), 0)
+		p = p.Add(left)
 
 		if bytes&(tdp.SignBits>>uint((8-left)*8)) != 0 {
 			p = p.Add(-left)
@@ -82,7 +83,7 @@ unicode:
 	n := min(8, int(e-p))
 	// Fast path for ASCII: simply check that all of the bytes don't have
 	// their sign bits set.
-	bytes := *xunsafe.Cast[uint64](p.AssertValid())
+	bytes := xunsafe.ByteLoad[uint64](p.AssertValid(), 0)
 	mask := uint64(tdp.SignBits) >> uint((8-n)*8)
 	ascii := bits.TrailingZeros64(bytes&mask) / 8
 	p1.Log(p2, "ascii bytes", "%016x, %d bytes", bytes, ascii)
@@ -114,7 +115,7 @@ unicode:
 		// Bounds check is complete here. We are free to load four bytes
 		// and mask off what we don't need. We can't re-use bytes here
 		// because the rune might straddle a boundary.
-		raw := *xunsafe.Cast[uint32](p.AssertValid())
+		raw := xunsafe.ByteLoad[uint32](p.AssertValid(), 0)
 		p1.Log(p2, "wide rune bits", "%08b, %d bytes", xunsafe.Bytes(&raw), count)
 
 		// This puts the contents of the first byte into r.
diff --git a/internal/testdata/scalars/utf8.yaml b/internal/testdata/scalars/utf8.yaml
index 7fb9d1b..cacabd3 100644
--- a/internal/testdata/scalars/utf8.yaml
+++ b/internal/testdata/scalars/utf8.yaml
@@ -16,6 +16,17 @@ type: hyperpb.test.Scalars
 protoscope:
 - '14: {`ff`}'
 
+- '14: {"a"}'
+- '14: {"ab"}'
+- '14: {"abc"}'
+- '14: {"abcd"}'
+- '14: {"abcde"}'
+- '14: {"abcdef"}'
+- '14: {"abcdefg"}'
+- '14: {"abcdefgh"}'
+- '14: {"abcdefghi"}'
+- '14: {"abcdefghij"}'
+
 # Various torn runes.
 - '14: {`c280`}'
 - '14: {`c2`}'
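
The ten new utf8.yaml cases, "a" through "abcdefghij", walk the string length from 1 to 10 and so cross the 8-byte word boundary, covering both the full-word loop in verifyUTF8 and the 0-to-7-byte tail that now advances with p = p.Add(left). The tail uses the same SWAR trick as parsePackedVarint: one 64-bit load, then a mask that keeps only the sign bits of the first `left` bytes. Below is a self-contained restatement using safe slice reads in place of the raw over-reading load, assuming tdp.SignBits is 0x8080808080808080 and a little-endian target (which the shift direction implies).

package main

import (
	"encoding/binary"
	"fmt"
)

// signBits has the high bit of each of the 8 bytes set; assumed to be
// the value of tdp.SignBits in the diff above.
const signBits uint64 = 0x8080808080808080

// allASCII reports whether b contains only ASCII bytes, checking 8 bytes
// per step and masking the partial tail, like verifyUTF8's fast path.
func allASCII(b []byte) bool {
	for len(b) >= 8 {
		if binary.LittleEndian.Uint64(b)&signBits != 0 {
			return false
		}
		b = b[8:]
	}
	if left := len(b); left > 0 {
		var tail [8]byte // zero padding stands in for the over-read
		copy(tail[:], b)
		word := binary.LittleEndian.Uint64(tail[:])
		// Keep only the sign bits of the first `left` bytes, as in
		// tdp.SignBits >> uint((8-left)*8).
		if word&(signBits>>uint((8-left)*8)) != 0 {
			return false
		}
	}
	return true
}

func main() {
	fmt.Println(allASCII([]byte("abcdefghij"))) // true
	fmt.Println(allASCII([]byte("abc\xff")))    // false
}

The zero-padded tail array here stands in for the over-read: the real code simply loads 8 bytes and relies on the mask to ignore the excess, which is why specimen placement (see snapBytes below) matters.
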
diff --git a/internal/testdata/testdata.go b/internal/testdata/testdata.go
index cf410b4..60ad4c5 100644
--- a/internal/testdata/testdata.go
+++ b/internal/testdata/testdata.go
@@ -151,7 +151,10 @@ func (test *TestCase) Run(t *testing.T, ctx *hyperpb.Shared, verbose bool) {
 
 	// Parse using hyperpb.
 	m2 := ctx.NewMessage(test.Type.Fast)
-	err2 := proto.Unmarshal(specimen, m2)
+	err2 := m2.Unmarshal(specimen, hyperpb.WithAllowAlias(true))
+	if err2 == nil {
+		err2 = m2.Initialized()
+	}
 
 	if verbose {
 		t.Logf("theirs: %v, ours: %v", err1, err2)
@@ -263,7 +266,50 @@ func parseTestCase(t testing.TB, path string, file []byte) *TestCase {
 			// making sure we have optimal message placement before we start.
 			test.Specimens[i] = vm.RelocatePageBoundary(test.Specimens[i], false)
 		}
+	} else {
+		// Ensure that each specimen is at the end of an allocation.
+		for n, b := range test.Specimens {
+			test.Specimens[n] = snapBytes(b)
+		}
 	}
 
 	return test
 }
+
+// snapBytes makes a copy of b such that b is at the end of a Go allocation.
+//
+// The Go allocator allocates objects on slabs (which it calls runtime.mspans),
+// which are large page-sized chunks divided into small pieces of the same size.
+// The sizes the Go allocator uses for slab objects are called size classes.
+//
+// This function discovers size classes using append(), which, when resizing,
+// always returns a slice whose capacity is a size class.
+//
+// This is used to trigger various checkptr corner cases in tests.
+func snapBytes(b []byte) []byte {
+	// Build a slice that is sized to a Go size class.
+	//
+	// This append lowers to a call to runtime.growslice, and, because
+	// sizeof(byte) == 1, triggers this bit of code at
+	// https://cs.opensource.google/go/go/+/refs/tags/go1.25.1:src/runtime/slice.go;l=210:
+	//
+	//	lenmem = uintptr(oldLen)
+	//	newlenmem = uintptr(newLen)
+	//	capmem = roundupsize(uintptr(newcap), noscan)
+	//	overflow = uintptr(newcap) > maxAlloc
+	//	newcap = int(capmem)
+	//
+	// runtime.roundupsize(n, noscan) rounds n up to the next size class
+	// larger than or equal to it.
+	//
+	// We always round the size up to at least 32, to avoid very tiny
+	// objects that share allocations.
+	buf := append([]byte(nil), make([]byte, max(32, len(b)))...)
+	buf = buf[:cap(buf):cap(buf)]
+
+	// Discard everything except the last len(b) bytes of buf.
+	buf = buf[len(buf)-len(b):]
+
+	copy(buf, b)
+	return buf
+}
diff --git a/internal/xunsafe/untyped.go b/internal/xunsafe/untyped.go
index fa36cf0..8b9beb6 100644
--- a/internal/xunsafe/untyped.go
+++ b/internal/xunsafe/untyped.go
@@ -14,7 +14,9 @@
 
 package xunsafe
 
-import "unsafe"
+import (
+	"unsafe"
+)
 
 // ByteAdd adds the given offset to p, without scaling.
 //
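
snapBytes exists so that specimens end flush with a heap allocation: the parsers above intentionally load 8 bytes where fewer remain, and placing the buffer at the very end of a size-class object makes those over-reads land exactly on an allocation boundary, the spot where checkptr is most likely to object. A quick illustration of the invariant it establishes, assuming snapBytes from this diff is in the same package:

package main

import "fmt"

func main() {
	b := snapBytes([]byte("hello"))
	fmt.Println(len(b), cap(b)) // 5 5: no spare capacity after the data
	// Because cap(b) == len(b) and the backing array was sized to a full
	// size class, b ends exactly where its allocation ends. A wide load
	// that starts inside b and reads 8 bytes would therefore cross the
	// allocation boundary, which is the corner case these tests target.
}
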