From ab10d2f21423af7c8c447afac4794df29e61b88e Mon Sep 17 00:00:00 2001
From: Jianhua Zheng <zhengjianhua@oneflow.org>
Date: Wed, 26 Jun 2024 10:35:28 +0000
Subject: [PATCH 1/3] autocast support new devices

---
 oneflow/api/python/framework/autocast.cpp     | 31 ++++++++++++-------
 oneflow/core/framework/autocast.cpp           |  2 +-
 .../auto_mixed_precision_lists.cpp            |  5 ++-
 python/oneflow/amp/autocast_mode.py           |  6 ++--
 .../utils/tensor/from_or_to_torch_tensor.py   | 16 ++++++++--
 5 files changed, 42 insertions(+), 18 deletions(-)
diff --git a/oneflow/api/python/framework/autocast.cpp b/oneflow/api/python/framework/autocast.cpp
index 506718e6dd8..f47e5fb91cc 100644
--- a/oneflow/api/python/framework/autocast.cpp
+++ b/oneflow/api/python/framework/autocast.cpp
@@ -16,7 +16,9 @@ limitations under the License.
 #include <pybind11/pybind11.h>
 #include "oneflow/api/python/of_api_registry.h"
 
+#include "oneflow/core/common/device_type.pb.h"
 #include "oneflow/core/common/throw.h"
+#include "oneflow/core/ep/include/device_manager_registry.h"
 #include "oneflow/core/framework/autocast.h"
 
 namespace py = pybind11;
@@ -36,7 +38,7 @@ class AutoCastMode {
  public:
   OF_DISALLOW_COPY_AND_MOVE(AutoCastMode);
 
-  AutoCastMode(const std::string& device_type, Symbol<DType> dtype, bool enabled,
+  AutoCastMode(const std::string& device_name, Symbol<DType> dtype, bool enabled,
                bool cache_enabled)
       : prev_enabled_(autocast::is_enabled()),
         prev_cache_enabled_(autocast::is_autocast_cache_enabled()),
@@ -48,16 +50,23 @@ class AutoCastMode {
     increase_nested_count();
     autocast::set_enabled(enabled);
     autocast::set_autocast_cache_enabled(cache_enabled);
-    if (device_type == "cpu") {
-      autocast::set_autocast_device_type(kCPU);
-      autocast::set_autocast_dtype(dtype);
-      autocast::set_autocast_cpu_dtype(dtype);
-    } else if (device_type == "cuda") {
-      autocast::set_autocast_device_type(kCUDA);
-      autocast::set_autocast_dtype(dtype);
-      autocast::set_autocast_gpu_dtype(dtype);
-    } else {
-      THROW(RuntimeError) << "User specified autocast device_type must be 'cuda' or 'cpu'";
+    auto device_type = ep::DeviceManagerRegistry::GetDeviceTypeByDeviceTypeName(device_name);
+    switch (device_type) {
+      case kCPU:
+        autocast::set_autocast_device_type(device_type);
+        autocast::set_autocast_dtype(dtype);
+        autocast::set_autocast_cpu_dtype(dtype);
+        break;
+      case kCUDA:
+      case kMLU:
+      case kNPU:
+        autocast::set_autocast_device_type(device_type);
+        autocast::set_autocast_dtype(dtype);
+        autocast::set_autocast_gpu_dtype(dtype);
+        break;
+      default:
+        THROW(RuntimeError)
+            << "User specified autocast device_type must be 'cuda' or 'cpu' or 'mlu' or 'npu'";
     }
   }
 
diff --git a/oneflow/core/framework/autocast.cpp b/oneflow/core/framework/autocast.cpp
index f3ce8320183..1817467d843 100644
--- a/oneflow/core/framework/autocast.cpp
+++ b/oneflow/core/framework/autocast.cpp
@@ -181,7 +181,7 @@ std::shared_ptr<AutoCastMeta> MakeAutoCastMeta(
   // autocast only supports the following device type(s) and low precision type(s):
-  //   - device type: CUDA
+  //   - device type: CUDA, MLU, NPU
   //   - low precision type: half, bfloat16
-  static std::vector<DeviceType> autocast_device_types{kCUDA};
+  static std::vector<DeviceType> autocast_device_types{kCUDA, kMLU, kNPU};
   static std::vector<Symbol<DType>> autocast_dtypes{DType::Float16(), DType::BFloat16()};
 
   if (autocast_meta->autocast_color() != kBlack) {
diff --git a/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp b/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp
index 0a3a64b330b..8fca38bd681 100644
--- a/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp
+++ b/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp
@@ -20,7 +20,9 @@ namespace oneflow {
 const AMPList& AutoMixedPrecisionLists::WhiteList() {
   static AMPList white_list = {"matmul",
                                "batch_matmul",
+                               "conv1d",
                                "conv2d",
+                               "conv3d",
                                "conv_data_grad",
                                "conv_filter_grad",
                                "conv_bias_grad",
@@ -137,7 +139,8 @@ const AMPList& AutoMixedPrecisionLists::GrayList() {
                               "group_norm_grad",
                               "silu",
                               "silu_grad",
-                              "fused_weighted_sum"};
+                              "fused_weighted_sum",
+                              "cast"};
   return gray_list;
 }
 
diff --git a/python/oneflow/amp/autocast_mode.py b/python/oneflow/amp/autocast_mode.py
index 3fbaf429566..a23be0ba5a0 100644
--- a/python/oneflow/amp/autocast_mode.py
+++ b/python/oneflow/amp/autocast_mode.py
@@ -153,10 +153,10 @@ def __init__(
         cache_enabled: Optional[bool] = None,
     ):
         self.device = device_type
-        if self.device == "cuda":
-            self.fast_dtype = flow.get_autocast_gpu_dtype()
-        elif self.device == "cpu":
+        if self.device == "cpu":
             self.fast_dtype = flow.get_autocast_cpu_dtype()
+        elif self.device in ["cuda", "mlu", "npu"]:
+            self.fast_dtype = flow.get_autocast_gpu_dtype()
         else:
             raise RuntimeError(
                 "User specified autocast device_type must be 'cuda' or 'cpu'"
diff --git a/python/oneflow/utils/tensor/from_or_to_torch_tensor.py b/python/oneflow/utils/tensor/from_or_to_torch_tensor.py
index c1e6cbab3b2..a769e6dbc3b 100644
--- a/python/oneflow/utils/tensor/from_or_to_torch_tensor.py
+++ b/python/oneflow/utils/tensor/from_or_to_torch_tensor.py
@@ -62,7 +62,13 @@ def from_torch(torch_tensor):
     except:
         print_error_msg()
     assert isinstance(torch_tensor, torch.Tensor)
-    return flow.from_dlpack(torch.to_dlpack(torch_tensor))
+    # return flow.from_dlpack(torch.to_dlpack(torch_tensor))
+    dtype = flow.float16
+    if torch_tensor.dtype == torch.int64:
+        dtype = flow.int64
+    elif torch_tensor.dtype != torch.float16:
+        print(torch_tensor.dtype)
+    return flow.tensor(torch_tensor.cpu().numpy(), device=flow.device("npu"), dtype=dtype).reshape([x for x in torch_tensor.shape])
 
 
 def to_torch(flow_tensor):
@@ -104,4 +110,10 @@ def to_torch(flow_tensor):
             "WARNING: `to_torch` received a global tensor. A PyTorch CPU tensor which is a copy of its data will be returned."
         )
         return torch.from_numpy(flow_tensor.numpy())
-    return torch.from_dlpack(flow.to_dlpack(flow_tensor))
+    # return torch.from_dlpack(flow.to_dlpack(flow_tensor))
+    dtype = torch.float16
+    if flow_tensor.dtype == flow.int64:
+        dtype = torch.int64
+    elif flow_tensor.dtype != flow.float16:
+        print(flow_tensor.dtype)
+    return torch.tensor(flow_tensor.numpy(), device="npu", dtype=dtype).reshape([x for x in flow_tensor.shape])

From 62bcb0970298129aeab1378dd822520211ca99ba Mon Sep 17 00:00:00 2001
From: oneflow-ci-bot <ci-bot@oneflow.org>
Date: Wed, 26 Jun 2024 12:18:04 +0000
Subject: [PATCH 2/3] auto format by CI

---
 python/oneflow/utils/tensor/from_or_to_torch_tensor.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/oneflow/utils/tensor/from_or_to_torch_tensor.py b/python/oneflow/utils/tensor/from_or_to_torch_tensor.py
index a769e6dbc3b..efd697270db 100644
--- a/python/oneflow/utils/tensor/from_or_to_torch_tensor.py
+++ b/python/oneflow/utils/tensor/from_or_to_torch_tensor.py
@@ -68,7 +68,9 @@ def from_torch(torch_tensor):
         dtype = flow.int64
     elif torch_tensor.dtype != torch.float16:
         print(torch_tensor.dtype)
-    return flow.tensor(torch_tensor.cpu().numpy(), device=flow.device("npu"), dtype=dtype).reshape([x for x in torch_tensor.shape])
+    return flow.tensor(
+        torch_tensor.cpu().numpy(), device=flow.device("npu"), dtype=dtype
+    ).reshape([x for x in torch_tensor.shape])
 
 
 def to_torch(flow_tensor):
@@ -116,4 +118,6 @@ def to_torch(flow_tensor):
         dtype = torch.int64
     elif flow_tensor.dtype != flow.float16:
         print(flow_tensor.dtype)
-    return torch.tensor(flow_tensor.numpy(), device="npu", dtype=dtype).reshape([x for x in flow_tensor.shape])
+    return torch.tensor(flow_tensor.numpy(), device="npu", dtype=dtype).reshape(
+        [x for x in flow_tensor.shape]
+    )

From b5d27730d542d6ab5f52f61a5847dcf8b8ddd809 Mon Sep 17 00:00:00 2001
From: Jianhua Zheng <zhengjianhua@oneflow.org>
Date: Thu, 27 Jun 2024 02:42:47 +0000
Subject: [PATCH 3/3] revert from_or_to_torch_tensor.py

---
 .../utils/tensor/from_or_to_torch_tensor.py   | 20 ++-----------------
 1 file changed, 2 insertions(+), 18 deletions(-)

diff --git a/python/oneflow/utils/tensor/from_or_to_torch_tensor.py b/python/oneflow/utils/tensor/from_or_to_torch_tensor.py
index efd697270db..c1e6cbab3b2 100644
--- a/python/oneflow/utils/tensor/from_or_to_torch_tensor.py
+++ b/python/oneflow/utils/tensor/from_or_to_torch_tensor.py
@@ -62,15 +62,7 @@ def from_torch(torch_tensor):
     except:
         print_error_msg()
     assert isinstance(torch_tensor, torch.Tensor)
-    # return flow.from_dlpack(torch.to_dlpack(torch_tensor))
-    dtype = flow.float16
-    if torch_tensor.dtype == torch.int64:
-        dtype = flow.int64
-    elif torch_tensor.dtype != torch.float16:
-        print(torch_tensor.dtype)
-    return flow.tensor(
-        torch_tensor.cpu().numpy(), device=flow.device("npu"), dtype=dtype
-    ).reshape([x for x in torch_tensor.shape])
+    return flow.from_dlpack(torch.to_dlpack(torch_tensor))
 
 
 def to_torch(flow_tensor):
@@ -112,12 +104,4 @@ def to_torch(flow_tensor):
             "WARNING: `to_torch` received a global tensor. A PyTorch CPU tensor which is a copy of its data will be returned."
         )
         return torch.from_numpy(flow_tensor.numpy())
-    # return torch.from_dlpack(flow.to_dlpack(flow_tensor))
-    dtype = torch.float16
-    if flow_tensor.dtype == flow.int64:
-        dtype = torch.int64
-    elif flow_tensor.dtype != flow.float16:
-        print(flow_tensor.dtype)
-    return torch.tensor(flow_tensor.numpy(), device="npu", dtype=dtype).reshape(
-        [x for x in flow_tensor.shape]
-    )
+    return torch.from_dlpack(flow.to_dlpack(flow_tensor))