Skip to content

Pandas On Spark crashing on Windows when applying a custom function. #55824

@IMarvinTPA

Description

@IMarvinTPA

It could be something I'm doing wrong, but it is super easy to reproduce:

import pyspark.pandas as ppd
import pyspark.sql as ss


# Build (or fetch an existing) Hive-enabled SparkSession named 'test' for the
# repro. Arrow-based PySpark conversion is switched on, ANSI SQL mode is
# switched off, and both Python fault-handler options are enabled
# (presumably to get more detail out of the crashing worker).
mySpark = ss.SparkSession.builder.appName('test') \
            .config("spark.sql.catalogImplementation", "hive") \
            .config("spark.sql.legacy.createHiveTableByDefault", "false")\
            .config("spark.sql.repl.eagerEval.enabled", True) \
            .config("spark.sql.sources.default", "parquet") \
            .config("spark.sql.execution.arrow.pyspark.enabled", "True") \
            .config("spark.sql.ansi.enabled", "false") \
            .config("spark.python.worker.faulthandler.enabled", "true") \
            .config("spark.sql.execution.pyspark.udf.faulthandler.enabled", "true") \
            .enableHiveSupport().getOrCreate()

            # .config("spark.executor.memory", "6g") \
            # .config("spark.executor.memoryOverhead", "1g") \
            # .config("spark.delta.catalog.update.enabled", "false") \


# Sample values for the repro column: decimal numbers written as strings,
# plus one missing entry (None).
DECIMAL_STR_LIST = [
    "1",
    "2.2",
    "3.3",
    "44.4",
    "555.55",
    "654321.12",
    "7.75",
    "-800000.8",
    None,
    "1000",
]


def noChange(val: str) -> str:
    """Return ``val`` converted with the built-in ``str``.

    A value that is already a ``str`` comes back equal to the input; any
    other object (``None`` included) is returned as its ``str()`` form.
    """
    as_text = str(val)
    return as_text

# Single-column pandas-on-Spark DataFrame built from DECIMAL_STR_LIST.
cols = {"aDecimal82": DECIMAL_STR_LIST}
df = ppd.DataFrame(cols)
# Run the custom function over the column and assign the result back.
df["aDecimal82"] = df["aDecimal82"].apply(noChange)

# Printing the frame is the statement the reported traceback points at.
print(df)

This code results in this error message:


Py4JJavaError Traceback (most recent call last)
Cell In[21], line 41
38 df = ppd.DataFrame(cols)
39 df["aDecimal82"] = df["aDecimal82"].apply(noChange)
---> 41 print(df)

File ...\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\pandas\frame.py:13997, in DataFrame.__repr__(self)
13993 max_display_count = get_option("display.max_rows")
13994 if max_display_count is None:
13995 return self._to_internal_pandas().to_string()
13996

--> 13997 pdf = cast("DataFrame", self._get_or_create_repr_pandas_cache(max_display_count))
13998 pdf_length = len(pdf)
13999 pdf = cast("DataFrame", pdf.iloc[:max_display_count])
14000 if pdf_length > max_display_count:

File ...\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\pandas\frame.py:13988, in DataFrame._get_or_create_repr_pandas_cache(self, n)
13985 def _get_or_create_repr_pandas_cache(self, n: int) -> Union[pd.DataFrame, pd.Series]:
13986 if not hasattr(self, "_repr_pandas_cache") or n not in self._repr_pandas_cache:
13987 object.__setattr__(

--> 13988 self, "_repr_pandas_cache", {n: self.head(n + 1)._to_internal_pandas()}
13989 )
13990 return self._repr_pandas_cache[n]

File ...\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\pandas\frame.py:13983, in DataFrame._to_internal_pandas(self)
13979 Return a pandas DataFrame directly from _internal to avoid overhead of copy.
13980
13981 This method is for internal use only.
13982 """

--> 13983 return self._internal.to_pandas_frame

File ...\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\pandas\utils.py:608, in lazy_property.<locals>.wrapped_lazy_property(self)
604 @property
605 @functools.wraps(fn)
606 def wrapped_lazy_property(self):
607 if not hasattr(self, attr_name):
--> 608 setattr(self, attr_name, fn(self))
609 return getattr(self, attr_name)

File ...\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\pandas\internal.py:1080, in InternalFrame.to_pandas_frame(self)
1078 """Return as pandas DataFrame."""
1079 sdf = self.to_internal_spark_frame
-> 1080 pdf = sdf._to_pandas(pandasStructHandlingMode="row")
1081 if len(pdf) == 0 and len(sdf.schema) > 0:
1082 pdf = pdf.astype(
1083 {field.name: spark_type_to_pandas_dtype(field.dataType) for field in sdf.schema}
1084 )

File ...\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\sql\classic\dataframe.py:1959, in DataFrame._to_pandas(self, **kwargs)
1958 def _to_pandas(self, **kwargs: Any) -> "PandasDataFrameLike":
-> 1959 return PandasConversionMixin._to_pandas(self, **kwargs)

File ...\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\sql\pandas\conversion.py:408, in PandasConversionMixin._to_pandas(self, **kwargs)
405 raise
407 # Below is toPandas without Arrow optimization.
--> 408 rows = self.collect()
409 if len(rows) > 0:
410 pdf = pd.DataFrame.from_records(rows, index=range(len(rows)), columns=self.columns)

File ...\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\sql\classic\dataframe.py:503, in DataFrame.collect(self)
501 def collect(self) -> List[Row]:
502 with SCCallSiteSync(self._sc):
--> 503 sock_info = self._jdf.collectToPython()
504 with _load_from_socket(sock_info, BatchedSerializer(CPickleSerializer())) as stream:
505 return list(stream)

File ...\AppData\Local\Programs\Python\Python312\Lib\site-packages\py4j\java_gateway.py:1362, in JavaMember.__call__(self, *args)
1356 command = proto.CALL_COMMAND_NAME +\
1357 self.command_header +\
1358 args_command +\
1359 proto.END_COMMAND_PART
1361 answer = self.gateway_client.send_command(command)
-> 1362 return_value = get_return_value(
1363 answer, self.gateway_client, self.target_id, self.name)
1365 for temp_arg in temp_args:
1366 if hasattr(temp_arg, "_detach"):

File ...\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyspark\errors\exceptions\captured.py:251, in capture_sql_exception.<locals>.deco(*a, **kw)
248 from py4j.protocol import Py4JJavaError
250 try:
--> 251 return f(*a, **kw)
252 except Py4JJavaError as e:
253 converted = convert_exception(e.java_exception)

File ...\AppData\Local\Programs\Python\Python312\Lib\site-packages\py4j\protocol.py:327, in get_return_value(answer, gateway_client, target_id, name)
325 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
326 if answer[1] == REFERENCE_TYPE:
--> 327 raise Py4JJavaError(
328 "An error occurred while calling {0}{1}{2}.\n".
329 format(target_id, ".", name), value)
330 else:
331 raise Py4JError(
332 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
333 format(target_id, ".", name, value))

Py4JJavaError: An error occurred while calling o4134.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 8 in stage 240.0 failed 1 times, most recent failure: Lost task 8.0 in stage 240.0 (TID 806) (mycomputername executor driver): org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:711)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:686)
at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:35)
at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1068)
at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1045)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:602)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:611)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at org.apache.spark.sql.execution.python.BatchedPythonArrowInput.writeNextBatchToArrowStream(PythonArrowInput.scala:190)
at org.apache.spark.sql.execution.python.BatchedPythonArrowInput.writeNextBatchToArrowStream$(PythonArrowInput.scala:180)
at org.apache.spark.sql.execution.python.ArrowEvalPythonEvaluatorFactory$$anon$2.writeNextBatchToArrowStream(ArrowEvalPythonExec.scala:198)
at org.apache.spark.sql.execution.python.PythonArrowInput$$anon$1.writeNextInputToStream(PythonArrowInput.scala:122)
at org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.writeAdditionalInputToPythonWorker(PythonRunner.scala:964)
at org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.read(PythonRunner.scala:879)
at java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244)
at java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:263)
at java.base/java.io.DataInputStream.readInt(DataInputStream.java:393)
at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:121)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:602)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:611)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:402)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:910)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:910)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:57)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:206)
at org.apache.spark.scheduler.Task.run(Task.scala:147)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:894)
at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:86)
at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:83)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:97)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:897)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.io.IOException: An established connection was aborted by the software in your host machine
at java.base/sun.nio.ch.SocketDispatcher.write0(Native Method)
at java.base/sun.nio.ch.SocketDispatcher.write(SocketDispatcher.java:54)
at java.base/sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:132)
at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:76)
at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:53)
at java.base/sun.nio.ch.SocketChannelImpl.write(SocketChannelImpl.java:532)
at org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.writeAdditionalInputToPythonWorker(PythonRunner.scala:975)
at org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.read(PythonRunner.scala:879)
at java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244)
at java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:263)
at java.base/java.io.DataInputStream.readInt(DataInputStream.java:393)
at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1053)
... 45 more

Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$3(DAGScheduler.scala:3304)
at scala.Option.getOrElse(Option.scala:201)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:3304)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:3296)
at scala.collection.immutable.List.foreach(List.scala:323)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:3296)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1353)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1353)
at scala.Option.foreach(Option.scala:437)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1353)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3575)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3503)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3492)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:50)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1053)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2496)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2517)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2536)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:544)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:497)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:58)
at org.apache.spark.sql.classic.Dataset.$anonfun$collectToPython$1(Dataset.scala:2084)
at org.apache.spark.sql.classic.Dataset.$anonfun$withAction$2(Dataset.scala:2264)
at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:743)
at org.apache.spark.sql.classic.Dataset.$anonfun$withAction$1(Dataset.scala:2262)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$8(SQLExecution.scala:228)
at org.apache.spark.sql.execution.SQLExecution$.withSessionTagsApplied(SQLExecution.scala:352)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$7(SQLExecution.scala:189)
at org.apache.spark.JobArtifactSet$.withActiveJobArtifactState(JobArtifactSet.scala:94)
at org.apache.spark.sql.artifact.ArtifactManager.$anonfun$withResources$1(ArtifactManager.scala:112)
at org.apache.spark.sql.artifact.ArtifactManager.withClassLoaderIfNeeded(ArtifactManager.scala:106)
at org.apache.spark.sql.artifact.ArtifactManager.withResources(ArtifactManager.scala:111)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$6(SQLExecution.scala:189)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:375)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$1(SQLExecution.scala:188)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:810)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId0(SQLExecution.scala:130)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:317)
at org.apache.spark.sql.classic.Dataset.withAction(Dataset.scala:2262)
at org.apache.spark.sql.classic.Dataset.collectToPython(Dataset.scala:2080)
at jdk.internal.reflect.GeneratedMethodAccessor165.invoke(Unknown Source)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:568)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:711)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:686)
at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:35)
at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1068)
at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1045)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:602)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:611)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at org.apache.spark.sql.execution.python.BatchedPythonArrowInput.writeNextBatchToArrowStream(PythonArrowInput.scala:190)
at org.apache.spark.sql.execution.python.BatchedPythonArrowInput.writeNextBatchToArrowStream$(PythonArrowInput.scala:180)
at org.apache.spark.sql.execution.python.ArrowEvalPythonEvaluatorFactory$$anon$2.writeNextBatchToArrowStream(ArrowEvalPythonExec.scala:198)
at org.apache.spark.sql.execution.python.PythonArrowInput$$anon$1.writeNextInputToStream(PythonArrowInput.scala:122)
at org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.writeAdditionalInputToPythonWorker(PythonRunner.scala:964)
at org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.read(PythonRunner.scala:879)
at java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244)
at java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:263)
at java.base/java.io.DataInputStream.readInt(DataInputStream.java:393)
at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:121)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:602)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:611)
at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:593)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:402)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:910)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:910)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:57)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:374)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:338)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:206)
at org.apache.spark.scheduler.Task.run(Task.scala:147)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:894)
at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:86)
at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:83)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:97)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:897)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
... 1 more
Caused by: java.io.IOException: An established connection was aborted by the software in your host machine
at java.base/sun.nio.ch.SocketDispatcher.write0(Native Method)
at java.base/sun.nio.ch.SocketDispatcher.write(SocketDispatcher.java:54)
at java.base/sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:132)
at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:76)
at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:53)
at java.base/sun.nio.ch.SocketChannelImpl.write(SocketChannelImpl.java:532)
at org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.writeAdditionalInputToPythonWorker(PythonRunner.scala:975)
at org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.read(PythonRunner.scala:879)
at java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244)
at java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:263)
at java.base/java.io.DataInputStream.readInt(DataInputStream.java:393)
at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1053)
... 45 more
Versions in the environment:
Python 3.12.10
Running Java 17.0.0.1
pandas Version: 2.2.3
spark Version: 4.2.0-preview5
numpy Version: 2.1.3
pyArrow Version: 19.0.1

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions