From ab10d2f21423af7c8c447afac4794df29e61b88e Mon Sep 17 00:00:00 2001 From: Jianhua Zheng Date: Wed, 26 Jun 2024 10:35:28 +0000 Subject: [PATCH 1/3] autocast support new devices --- oneflow/api/python/framework/autocast.cpp | 31 ++++++++++++------- oneflow/core/framework/autocast.cpp | 2 +- .../auto_mixed_precision_lists.cpp | 5 ++- python/oneflow/amp/autocast_mode.py | 6 ++-- .../utils/tensor/from_or_to_torch_tensor.py | 16 ++++++++-- 5 files changed, 42 insertions(+), 18 deletions(-) diff --git a/oneflow/api/python/framework/autocast.cpp b/oneflow/api/python/framework/autocast.cpp index 506718e6dd8..f47e5fb91cc 100644 --- a/oneflow/api/python/framework/autocast.cpp +++ b/oneflow/api/python/framework/autocast.cpp @@ -16,7 +16,9 @@ limitations under the License. #include #include "oneflow/api/python/of_api_registry.h" +#include "oneflow/core/common/device_type.pb.h" #include "oneflow/core/common/throw.h" +#include "oneflow/core/ep/include/device_manager_registry.h" #include "oneflow/core/framework/autocast.h" namespace py = pybind11; @@ -36,7 +38,7 @@ class AutoCastMode { public: OF_DISALLOW_COPY_AND_MOVE(AutoCastMode); - AutoCastMode(const std::string& device_type, Symbol dtype, bool enabled, + AutoCastMode(const std::string& device_name, Symbol dtype, bool enabled, bool cache_enabled) : prev_enabled_(autocast::is_enabled()), prev_cache_enabled_(autocast::is_autocast_cache_enabled()), @@ -48,16 +50,23 @@ class AutoCastMode { increase_nested_count(); autocast::set_enabled(enabled); autocast::set_autocast_cache_enabled(cache_enabled); - if (device_type == "cpu") { - autocast::set_autocast_device_type(kCPU); - autocast::set_autocast_dtype(dtype); - autocast::set_autocast_cpu_dtype(dtype); - } else if (device_type == "cuda") { - autocast::set_autocast_device_type(kCUDA); - autocast::set_autocast_dtype(dtype); - autocast::set_autocast_gpu_dtype(dtype); - } else { - THROW(RuntimeError) << "User specified autocast device_type must be 'cuda' or 'cpu'"; + auto device_type = ep::DeviceManagerRegistry::GetDeviceTypeByDeviceTypeName(device_name); + switch (device_type) { + case kCPU: + autocast::set_autocast_device_type(device_type); + autocast::set_autocast_dtype(dtype); + autocast::set_autocast_cpu_dtype(dtype); + break; + case kCUDA: + case kMLU: + case kNPU: + autocast::set_autocast_device_type(device_type); + autocast::set_autocast_dtype(dtype); + autocast::set_autocast_gpu_dtype(dtype); + break; + default: + THROW(RuntimeError) + << "User specified autocast device_type must be 'cuda' or 'cpu' or 'mlu' or 'npu'"; } } diff --git a/oneflow/core/framework/autocast.cpp b/oneflow/core/framework/autocast.cpp index f3ce8320183..1817467d843 100644 --- a/oneflow/core/framework/autocast.cpp +++ b/oneflow/core/framework/autocast.cpp @@ -181,7 +181,7 @@ std::shared_ptr MakeAutoCastMeta( // autocast only supports the following device type(s) and low precision type(s): // - device type: CUDA // - low precision type: half, bfloat16 - static std::vector autocast_device_types{kCUDA}; + static std::vector autocast_device_types{kCUDA, kMLU, kNPU}; static std::vector> autocast_dtypes{DType::Float16(), DType::BFloat16()}; if (autocast_meta->autocast_color() != kBlack) { diff --git a/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp b/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp index 0a3a64b330b..8fca38bd681 100644 --- a/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp +++ b/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp @@ -20,7 +20,9 @@ namespace oneflow { const AMPList& AutoMixedPrecisionLists::WhiteList() { static AMPList white_list = {"matmul", "batch_matmul", + "conv1d", "conv2d", + "conv3d", "conv_data_grad", "conv_filter_grad", "conv_bias_grad", @@ -137,7 +139,8 @@ const AMPList& AutoMixedPrecisionLists::GrayList() { "group_norm_grad", "silu", "silu_grad", - "fused_weighted_sum"}; + "fused_weighted_sum", + "cast"}; return gray_list; } diff --git a/python/oneflow/amp/autocast_mode.py b/python/oneflow/amp/autocast_mode.py index 3fbaf429566..a23be0ba5a0 100644 --- a/python/oneflow/amp/autocast_mode.py +++ b/python/oneflow/amp/autocast_mode.py @@ -153,10 +153,10 @@ def __init__( cache_enabled: Optional[bool] = None, ): self.device = device_type - if self.device == "cuda": - self.fast_dtype = flow.get_autocast_gpu_dtype() - elif self.device == "cpu": + if self.device == "cpu": self.fast_dtype = flow.get_autocast_cpu_dtype() + elif self.device in ["cuda", "mlu", "npu"]: + self.fast_dtype = flow.get_autocast_gpu_dtype() else: raise RuntimeError( "User specified autocast device_type must be 'cuda' or 'cpu'" diff --git a/python/oneflow/utils/tensor/from_or_to_torch_tensor.py b/python/oneflow/utils/tensor/from_or_to_torch_tensor.py index c1e6cbab3b2..a769e6dbc3b 100644 --- a/python/oneflow/utils/tensor/from_or_to_torch_tensor.py +++ b/python/oneflow/utils/tensor/from_or_to_torch_tensor.py @@ -62,7 +62,13 @@ def from_torch(torch_tensor): except: print_error_msg() assert isinstance(torch_tensor, torch.Tensor) - return flow.from_dlpack(torch.to_dlpack(torch_tensor)) + # return flow.from_dlpack(torch.to_dlpack(torch_tensor)) + dtype = flow.float16 + if torch_tensor.dtype == torch.int64: + dtype = flow.int64 + elif torch_tensor.dtype != torch.float16: + print(torch_tensor.dtype) + return flow.tensor(torch_tensor.cpu().numpy(), device=flow.device("npu"), dtype=dtype).reshape([x for x in torch_tensor.shape]) def to_torch(flow_tensor): @@ -104,4 +110,10 @@ def to_torch(flow_tensor): "WARNING: `to_torch` received a global tensor. A PyTorch CPU tensor which is a copy of its data will be returned." ) return torch.from_numpy(flow_tensor.numpy()) - return torch.from_dlpack(flow.to_dlpack(flow_tensor)) + # return torch.from_dlpack(flow.to_dlpack(flow_tensor)) + dtype = torch.float16 + if flow_tensor.dtype == flow.int64: + dtype = torch.int64 + elif flow_tensor.dtype != flow.float16: + print(flow_tensor.dtype) + return torch.tensor(flow_tensor.numpy(), device="npu", dtype=dtype).reshape([x for x in flow_tensor.shape]) From 62bcb0970298129aeab1378dd822520211ca99ba Mon Sep 17 00:00:00 2001 From: oneflow-ci-bot Date: Wed, 26 Jun 2024 12:18:04 +0000 Subject: [PATCH 2/3] auto format by CI --- python/oneflow/utils/tensor/from_or_to_torch_tensor.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/oneflow/utils/tensor/from_or_to_torch_tensor.py b/python/oneflow/utils/tensor/from_or_to_torch_tensor.py index a769e6dbc3b..efd697270db 100644 --- a/python/oneflow/utils/tensor/from_or_to_torch_tensor.py +++ b/python/oneflow/utils/tensor/from_or_to_torch_tensor.py @@ -68,7 +68,9 @@ def from_torch(torch_tensor): dtype = flow.int64 elif torch_tensor.dtype != torch.float16: print(torch_tensor.dtype) - return flow.tensor(torch_tensor.cpu().numpy(), device=flow.device("npu"), dtype=dtype).reshape([x for x in torch_tensor.shape]) + return flow.tensor( + torch_tensor.cpu().numpy(), device=flow.device("npu"), dtype=dtype + ).reshape([x for x in torch_tensor.shape]) def to_torch(flow_tensor): @@ -116,4 +118,6 @@ def to_torch(flow_tensor): dtype = torch.int64 elif flow_tensor.dtype != flow.float16: print(flow_tensor.dtype) - return torch.tensor(flow_tensor.numpy(), device="npu", dtype=dtype).reshape([x for x in flow_tensor.shape]) + return torch.tensor(flow_tensor.numpy(), device="npu", dtype=dtype).reshape( + [x for x in flow_tensor.shape] + ) From b5d27730d542d6ab5f52f61a5847dcf8b8ddd809 Mon Sep 17 00:00:00 2001 From: Jianhua Zheng Date: Thu, 27 Jun 2024 02:42:47 +0000 Subject: [PATCH 3/3] revert from_or_to_torch_tensor.py --- .../utils/tensor/from_or_to_torch_tensor.py | 20 ++----------------- 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/python/oneflow/utils/tensor/from_or_to_torch_tensor.py b/python/oneflow/utils/tensor/from_or_to_torch_tensor.py index efd697270db..c1e6cbab3b2 100644 --- a/python/oneflow/utils/tensor/from_or_to_torch_tensor.py +++ b/python/oneflow/utils/tensor/from_or_to_torch_tensor.py @@ -62,15 +62,7 @@ def from_torch(torch_tensor): except: print_error_msg() assert isinstance(torch_tensor, torch.Tensor) - # return flow.from_dlpack(torch.to_dlpack(torch_tensor)) - dtype = flow.float16 - if torch_tensor.dtype == torch.int64: - dtype = flow.int64 - elif torch_tensor.dtype != torch.float16: - print(torch_tensor.dtype) - return flow.tensor( - torch_tensor.cpu().numpy(), device=flow.device("npu"), dtype=dtype - ).reshape([x for x in torch_tensor.shape]) + return flow.from_dlpack(torch.to_dlpack(torch_tensor)) def to_torch(flow_tensor): @@ -112,12 +104,4 @@ def to_torch(flow_tensor): "WARNING: `to_torch` received a global tensor. A PyTorch CPU tensor which is a copy of its data will be returned." ) return torch.from_numpy(flow_tensor.numpy()) - # return torch.from_dlpack(flow.to_dlpack(flow_tensor)) - dtype = torch.float16 - if flow_tensor.dtype == flow.int64: - dtype = torch.int64 - elif flow_tensor.dtype != flow.float16: - print(flow_tensor.dtype) - return torch.tensor(flow_tensor.numpy(), device="npu", dtype=dtype).reshape( - [x for x in flow_tensor.shape] - ) + return torch.from_dlpack(flow.to_dlpack(flow_tensor))