
Commit f346fd0

hyeontaek authored and Google-ML-Automation committed
[JAX] Use xla::ifrt::Client::MakeArraysFromHostBufferShards() in Array creation when possible
This change uses the new `xla::ifrt::Client::MakeArraysFromHostBufferShards()` API when possible. The API creates a multi-shard IFRT Array (to be wrapped as a JAX `PyArray`) in a single call, which gives the runtime more optimization opportunities than creating single-device IFRT Arrays and then assembling them. Note that the `MakeArraysFromHostBufferShards()` implementation in PjRt-IFRT is not yet optimized, so there is no immediate performance benefit for McJAX.

As an exception, `BatchedDevicePut` takes the previous array-assembly path if any of its shards is not a host buffer but already a single-device array, because `MakeArraysFromHostBufferShards()` requires all sharded inputs to be host buffers.

With batching now possible at the IFRT level, we skip the `DevicePutResultFn` step: `DevicePut` (now split into `DevicePutWithDevice` and `DevicePutWithSharding`) internally calls per-shard functions (with the GIL released) and returns a final IFRT Array.

This change also cleans up `xla::DevicePutResult::owning_pybuffer`, which was originally intended to hold a Python object keeping an IFRT Array valid when the Array is created by `DevicePut()` implementations; that role is now entirely covered by the `on_done_with_host_buffer` callback supplied at IFRT Array creation time.

PiperOrigin-RevId: 749989229
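For orientation, the change at the IFRT API level looks roughly like the sketch below. This is a minimal illustration, not code from this commit: `HostShard`, `host_shards`, `specs`, `shape`, and `sharding` are hypothetical stand-ins, and the parameter list of `MakeArraysFromHostBufferShards()` is paraphrased from its intent rather than copied from ifrt/client.h.

// Old path: one MakeArrayFromHostBuffer() per shard, then an assembly step.
std::vector<tsl::RCReference<xla::ifrt::Array>> shards;
for (const HostShard& s : host_shards) {  // HostShard: hypothetical struct
  TF_ASSIGN_OR_RETURN(
      shards.emplace_back(),
      client->MakeArrayFromHostBuffer(s.data, s.dtype, s.shape,
                                      /*byte_strides=*/std::nullopt,
                                      s.single_device_sharding, semantics,
                                      s.on_done_with_host_buffer));
}
TF_ASSIGN_OR_RETURN(
    auto array,
    client->AssembleArrayFromSingleDeviceArrays(
        shape, sharding, absl::MakeSpan(shards),
        xla::ifrt::ArrayCopySemantics::kReuseInput,
        xla::ifrt::SingleDeviceShardSemantics::kAddressableShards));

// New path: one call that sees every shard at once, so the runtime can batch
// host-to-device transfers or construct the sharded array directly.
TF_ASSIGN_OR_RETURN(auto arrays,
                    client->MakeArraysFromHostBufferShards(specs, semantics));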
1 parent d0b6eb2 commit f346fd0

File tree: 8 files changed (+587, −387 lines)

jaxlib/xla/pjit.cc

+18 −24
@@ -434,13 +434,15 @@ void CallShardArgFallback(
 // Prepares the input PjRtBuffers from the python arguments. This is equivalent
 // to shard_args() in pxla.py but for only a few supported cases.
 absl::StatusOr<std::vector<tsl::RCReference<xla::ifrt::Array>>>
-PrepareIfrtInputs(const xla::PyLoadedExecutable& executable,
-                  absl::Span<nb::object const> flat_dynamic_args,
-                  bool enable_x64, const std::vector<bool>& kept_args,
-                  const std::vector<nb::object>& in_shardings,
-                  const std::vector<nb::object>& in_device_local_layouts,
-                  const nb::callable& shard_arg_fallback,
-                  std::vector<nb::object>& keep_alive_objects) {
+PrepareIfrtInputs(
+    const xla::PyLoadedExecutable& executable,
+    absl::Span<nb::object const> flat_dynamic_args,
+    absl::Span<xla::PyArgSignature const> flat_dynamic_arg_signatures,
+    bool enable_x64, const std::vector<bool>& kept_args,
+    const std::vector<nb::object>& in_shardings,
+    const std::vector<nb::object>& in_device_local_layouts,
+    const nb::callable& shard_arg_fallback,
+    std::vector<nb::object>& keep_alive_objects) {
   const auto& addressable_devices =
       executable.ifrt_loaded_executable()->addressable_devices();
   const auto& num_global_devices =
@@ -484,20 +486,11 @@ PrepareIfrtInputs(const xla::PyLoadedExecutable& executable,
       TF_RETURN_IF_ERROR(
           jax::ApplyTransferGuardToHostToDevice(transfer_guard_formatter));
       TF_ASSIGN_OR_RETURN(
-          auto on_device_fn,
-          DevicePut(arg, executable.ifrt_loaded_executable()->client(),
-                    data_device, options, xla::ifrt::MemoryKind()));
-      TF_ASSIGN_OR_RETURN(xla::DevicePutResult on_device, [&]() {
-        // Must release the GIL before calling IFRT because backends may
-        // decide to block/sleep for device buffer allocation.
-        nb::gil_scoped_release gil_release;
-        return std::move(on_device_fn)();
-      }());
-
-      num_args_arrays.push_back(std::move(on_device.ifrt_array));
-      if (on_device.owning_pybuffer) {
-        keep_alive_objects.push_back(std::move(on_device.owning_pybuffer));
-      }
+          auto device_put_result,
+          DevicePutWithDevice(arg,
+                              executable.ifrt_loaded_executable()->client(),
+                              data_device, xla::ifrt::MemoryKind(), options));
+      num_args_arrays.push_back(std::move(device_put_result.ifrt_array));
       continue;
     } else {
       CallShardArgFallback(arg, in_shardings[dce_index],
@@ -750,9 +743,10 @@ absl::StatusOr<nb::object> PjitFunction::Call(nb::handle callable,
   // A vector of [num_inputs].
   auto num_args_arrays = PrepareIfrtInputs(
       *cache_entry->executable, flat_dynamic_args,
-      call_signature.jax_enable_x64, cache_entry->kept_var_bitvec,
-      cache_entry->in_shardings, cache_entry->in_device_local_layouts,
-      shard_arg_fallback_, keep_alive_objects);
+      call_signature.dynamic_arg_signatures, call_signature.jax_enable_x64,
+      cache_entry->kept_var_bitvec, cache_entry->in_shardings,
+      cache_entry->in_device_local_layouts, shard_arg_fallback_,
+      keep_alive_objects);

   if (!num_args_arrays.ok()) {
     VLOG(2) << "Failed to prepare IFRT inputs: " << num_args_arrays.status();

jaxlib/xla/pmap_lib.cc

+16 −56
@@ -56,7 +56,6 @@ limitations under the License.
 #include "jaxlib/xla/pytree.h"
 #include "jaxlib/xla/sharded_device_array.h"
 #include "jaxlib/xla/sharding.h"
-#include "jaxlib/xla/to_ifrt_sharding.h"
 #include "jaxlib/xla/traceback.h"
 #include "xla/pjrt/exceptions.h"
 #include "xla/pjrt/status_casters.h"
@@ -65,7 +64,6 @@ limitations under the License.
 #include "xla/python/ifrt/device_list.h"
 #include "xla/python/ifrt/executable.h"
 #include "xla/python/ifrt/memory.h"
-#include "xla/python/ifrt/shape.h"
 #include "xla/python/ifrt/sharding.h"
 #include "xla/python/nb_helpers.h"
 #include "xla/python/nb_numpy.h"
@@ -186,74 +184,36 @@ absl::StatusOr<ShardArgResult> ShardArg(
                               indices.size(), n_devices);
     }

-    std::vector<tsl::RCReference<xla::ifrt::Array>> per_device_arrays;
-    per_device_arrays.reserve(n_devices);
-    absl::InlinedVector<xla::ifrt::Device*, 1> devices;
-    devices.reserve(n_devices);
-    // TODO(hyeontaek): The created array will never be disassembled. We should
-    // omit collecting shapes and make the OpaqueSharding non-disassemblable?
-    std::vector<xla::ifrt::Shape> shapes;
-    shapes.reserve(n_devices);
-
-    nb::list owning_pylist;
     ShardArgResult result;
-    result.owning_sda = owning_pylist;
     const bool jax_enable_x64 = GetEnableX64();

-    std::vector<xla::DevicePutResultFn> device_put_fns;
-    device_put_fns.reserve(n_devices);
+    std::vector<nb::object> owning_args;
+    std::vector<nb::handle> args;
+    owning_args.reserve(n_devices);
+    args.reserve(n_devices);
     xla::DevicePutOptions options;
     options.squash_64bit_types = !jax_enable_x64;
     options.allow_zero_copy = true;
+    xla::ifrt::Client* ifrt_client = nullptr;
     for (size_t i = 0; i < n_devices; ++i) {
       auto to_device = nb::cast<xla::PyDevice*>(py_devices_list[i]);
       if (to_device->client().get() == nullptr) {
         return xla::InvalidArgument("Cannot copy to unattached devices.");
       }
-
-      TF_ASSIGN_OR_RETURN(
-          device_put_fns.emplace_back(),
-          DevicePut(arg[indices[i]], to_device->client()->ifrt_client(),
-                    to_device->device(), options, xla::ifrt::MemoryKind()));
-    }
-    std::vector<xla::DevicePutResult> device_puts;
-    device_puts.reserve(n_devices);
-    {
-      nb::gil_scoped_release gil_release;
-      for (auto& device_put_fn : device_put_fns) {
-        TF_ASSIGN_OR_RETURN(auto device_put, std::move(device_put_fn)());
-        device_puts.push_back(std::move(device_put));
-      }
-    }
-    for (auto& device_put : device_puts) {
-      per_device_arrays.push_back(std::move(device_put.ifrt_array));
-      devices.push_back(
-          per_device_arrays.back()->sharding().devices()->devices().front());
-      shapes.push_back(per_device_arrays.back()->shape());
-      if (device_put.owning_pybuffer) {
-        owning_pylist.append(device_put.owning_pybuffer);
+      if (i == 0) {
+        ifrt_client = to_device->client()->ifrt_client();
       }
+      owning_args.push_back(arg[indices[i]]);
+      args.push_back(owning_args.back());
     }
-
-    if (per_device_arrays.empty()) {
-      return xla::InvalidArgument("Per-device arrays must not be empty.");
-    }
-    // TODO(hyeontaek): The logical shape here is inaccurate. We
-    // may want to avoid creating a new Array or specialize Array
-    // to disallow access to the logical shape.
-    xla::ifrt::Shape shape = per_device_arrays.front()->shape();
-    TF_ASSIGN_OR_RETURN(
-        auto ifrt_sharding,
-        xla::GetIfrtConcreteSharding(input_spec.array_sharding, shape, shapes));
+    CHECK(ifrt_client != nullptr);
     TF_ASSIGN_OR_RETURN(
-        result.ifrt_array,
-        per_device_arrays.front()
-            ->client()
-            ->AssembleArrayFromSingleDeviceArrays(
-                std::move(shape), std::move(ifrt_sharding),
-                absl::MakeSpan(per_device_arrays),
-                xla::ifrt::ArrayCopySemantics::kReuseInput,
-                xla::ifrt::SingleDeviceShardSemantics::kAddressableShards));
+        xla::DevicePutResult device_put_result,
+        xla::DevicePutWithSharding(
+            args, ifrt_client, ndarray.dtype(),
+            nb::cast<std::vector<int64_t>>(ndarray.attr("shape")),
+            input_spec.array_sharding, options));
+    result.ifrt_array = std::move(device_put_result.ifrt_array);
     return result;
   }
   tsl::profiler::TraceMe traceme("pmap_lib_shard_arg_python_fallback");
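One lifetime detail is easy to miss in the new `ShardArg`: `nb::handle` is a non-owning view, so the `nb::object`s produced by `arg[indices[i]]` must stay alive until `DevicePutWithSharding()` returns, which is the sole purpose of `owning_args`. A minimal sketch of the idiom; `MakeShard()` and `Consume()` are hypothetical stand-ins:

size_t n = 8;                    // example shard count
std::vector<nb::object> owning;  // strong references keep the shards alive
std::vector<nb::handle> views;   // borrowed views, cheap to copy and pass
for (size_t i = 0; i < n; ++i) {
  // Pushing MakeShard(i) straight into `views` would leave a dangling
  // handle once the temporary nb::object is destroyed.
  owning.push_back(MakeShard(i));
  views.push_back(owning.back());  // handle borrows the stored object
}
Consume(views);  // safe: every handle is backed by an entry in `owning`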

jaxlib/xla/py_array.cc

+13 −61
@@ -1257,89 +1257,41 @@ absl::StatusOr<PyArray> PyArray::BatchedDevicePut(
   options.allow_zero_copy =
       (!force_copy && (host_buffer_semantics ==
                        ifrt::Client::HostBufferSemantics::kImmutableZeroCopy));
-  if (!dst_devices.empty()) {
-    options.ifrt_user_context =
-        dst_devices.front()->client()->ifrt_client()->CreateUserContext();
-  }

-  nb::list owning_pylist;
   std::vector<tsl::RCReference<ifrt::Array>> ifrt_arrays;

   absl::InlinedVector<ifrt::Device*, 1> devices;
   devices.reserve(n_devices);
   std::vector<xla::ifrt::Shape> shapes;
   shapes.reserve(n_devices);

-  ifrt::MemoryKind dst_memory_kind = xla::GetMemoryKind(sharding);
-
-  std::vector<DevicePutResultFn> device_put_fns;
-  device_put_fns.reserve(xs.size());
-  size_t i = 0;
-  for (auto& x : xs) {
+  std::vector<nb::handle> args;
+  args.reserve(xs.size());
+  for (const nb::object& x : xs) {
     if (PyArray::IsPyArray(x)) {
       TF_RETURN_IF_ERROR(
           jax::ApplyTransferGuardToDeviceToDevice(transfer_guard_formatter));
     } else {
       TF_RETURN_IF_ERROR(
           jax::ApplyTransferGuardToHostToDevice(transfer_guard_formatter));
     }
-    TF_ASSIGN_OR_RETURN(
-        device_put_fns.emplace_back(),
-        DevicePut(x, dst_devices[i]->client()->ifrt_client(),
-                  dst_devices[i]->device(), options, dst_memory_kind));
-    ++i;
-  }
-  std::vector<DevicePutResult> device_puts;
-  device_puts.reserve(device_put_fns.size());
-  {
-    nb::gil_scoped_release gil_release;
-    for (auto& device_put_fn : device_put_fns) {
-      TF_ASSIGN_OR_RETURN(auto device_put, std::move(device_put_fn)());
-      device_puts.push_back(std::move(device_put));
-    }
-  }
-  for (auto& device_put : device_puts) {
-    ifrt_arrays.push_back(std::move(device_put.ifrt_array));
-    devices.push_back(
-        ifrt_arrays.back()->sharding().devices()->devices().front());
-    shapes.push_back(ifrt_arrays.back()->shape());
-    if (device_put.owning_pybuffer) {
-      owning_pylist.append(device_put.owning_pybuffer);
-    }
+    args.push_back(x);
   }
-
-  // TODO(phawkins): it's highly suspicious to me that owning_pylist isn't
-  // consumed here. Look into this.
-
   auto weak_type = nb::cast<bool>(aval.attr("weak_type"));
   auto dtype = aval.attr("dtype");
   auto shape = nb::cast<std::vector<int64_t>>(aval.attr("shape"));
+  TF_ASSIGN_OR_RETURN(nb_class_ptr<jax::PyDeviceList> py_device_list,
+                      jax::GetPyDeviceList(sharding));

   TF_ASSIGN_OR_RETURN(
-      auto ifrt_sharding,
-      sharding.type().is(jax::PmapSharding::type())
-          ? xla::GetIfrtConcreteSharding(sharding, ifrt::Shape(shape),
-                                         std::move(shapes))
-          : xla::GetIfrtHloSharding(sharding, ifrt::Shape(shape)));
-  TF_ASSIGN_OR_RETURN(auto ifrt_dtype, DtypeToIfRtDType(dtype));
-  // TODO(emilyaf): Remove the following and just use ifrt_dtype when tokens are
-  // supported.
-  ifrt::DType array_dtype =
-      ifrt_arrays.empty() ? ifrt_dtype : ifrt_arrays.front()->dtype();
-  TF_ASSIGN_OR_RETURN(auto py_device_list, jax::GetPyDeviceList(sharding));
-  TF_ASSIGN_OR_RETURN(
-      auto ifrt_array,
-      py_device_list->py_client()
-          ->ifrt_client()
-          ->AssembleArrayFromSingleDeviceArrays(
-              array_dtype, ifrt::Shape(shape), std::move(ifrt_sharding),
-              absl::MakeSpan(ifrt_arrays),
-              xla::ifrt::ArrayCopySemantics::kReuseInput,
-              xla::ifrt::SingleDeviceShardSemantics::kAddressableShards));
-
-  return PyArray(aval, weak_type, dtype, std::move(shape), sharding,
+      DevicePutResult device_put_result,
+      DevicePutWithSharding(args, py_device_list->py_client()->ifrt_client(),
+                            dtype, shape, sharding, options));
+
+  return PyArray(aval, weak_type, dtype, std::move(shape), std::move(sharding),
                  py_device_list->py_client(), Traceback::Get(),
-                 std::move(ifrt_array), committed, /*skip_checks=*/true);
+                 std::move(device_put_result.ifrt_array), committed,
+                 /*skip_checks=*/true);
 }

 absl::StatusOr<PyArray> PyArray::ReorderShards(
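Note that `BatchedDevicePut` now hands every input to `DevicePutWithSharding()` unconditionally; the exception described in the commit message, falling back to per-shard puts plus assembly when some shard is already a single-device array rather than a host buffer, happens inside that helper, which this diff does not show. A paraphrased sketch of the rule only, with hypothetical helper names:

// Inside DevicePutWithSharding (sketch; PutViaHostBufferShards and
// PutViaAssembly are hypothetical names for the two strategies).
bool all_host_buffers = true;
for (const nb::handle& x : args) {
  if (xla::PyArray::IsPyArray(x)) {  // shard is already device-resident
    all_host_buffers = false;
    break;
  }
}
if (all_host_buffers) {
  // One ifrt::Client::MakeArraysFromHostBufferShards() call covers all shards.
  return PutViaHostBufferShards(args, ifrt_client, dtype, shape, sharding,
                                options);
}
// Previous path: per-shard device puts, then assembly.
return PutViaAssembly(args, ifrt_client, dtype, shape, sharding, options);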

jaxlib/xla/py_client.cc

+15 −19
@@ -57,6 +57,7 @@ limitations under the License.
 #include "jaxlib/xla/py_memory_space.h"
 #include "jaxlib/xla/py_values.h"
 #include "jaxlib/xla/python_ref_manager.h"
+#include "jaxlib/xla/sharding.h"
 #include "jaxlib/xla/traceback.h"
 #include "xla/literal.h"
 #include "xla/pjrt/exceptions.h"
@@ -66,6 +67,7 @@ limitations under the License.
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_layout.h"
 #include "xla/pjrt/status_casters.h"
+#include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/client.h"
 #include "xla/python/ifrt/compiler.h"
 #include "xla/python/ifrt/device.h"
@@ -339,25 +341,19 @@ absl::Status PyClient::Defragment() {
   options.allow_zero_copy =
       (!force_copy && (host_buffer_semantics ==
                        ifrt::Client::HostBufferSemantics::kImmutableZeroCopy));
-  TF_ASSIGN_OR_RETURN(auto put_fn,
-                      DevicePut(argument, client->ifrt_client_.get(), device,
-                                options, ifrt::MemoryKind()));
-  TF_ASSIGN_OR_RETURN(auto put, [&]() {
-    // Must release the GIL before calling IFRT because backends may
-    // decide to block/sleep for device buffer allocation.
-    nb::gil_scoped_release gil_release;
-    return std::move(put_fn)();
-  }());
-
-  if (put.ifrt_array) {
-    auto traceback = Traceback::Get();
-    return PyArray::MakeFromSingleDeviceArray(
-        std::move(client), std::move(traceback), std::move(put.ifrt_array),
-        /*weak_type=*/false,
-        /*committed=*/false);
-  } else {
-    return put.owning_pybuffer;
-  }
+  TF_ASSIGN_OR_RETURN(DevicePutResult device_put_result,
+                      DevicePutWithDevice(argument, client->ifrt_client_.get(),
+                                          device, ifrt::MemoryKind(), options));
+  auto sharding = make_nb_class<jax::SingleDeviceSharding>(
+      client, client->ifrt_client()->MakeDeviceList({device}),
+      /*memory_kind=*/nb::none());

+  auto traceback = Traceback::Get();
+  return PyArray::MakeFromIfrtArrayAndSharding(
+      std::move(client), std::move(traceback),
+      std::move(device_put_result.ifrt_array), std::move(sharding),
+      /*weak_type=*/false, /*committed=*/false,
+      /*skip_checks=*/true);
 }

 namespace {
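With `owning_pybuffer` gone, this function no longer has a branch that returns the original Python buffer: it always produces an IFRT array and wraps it in a `PyArray`, building the `SingleDeviceSharding` up front and passing `/*skip_checks=*/true` (presumably safe because the array was just created against that exact sharding). A condensed before/after of the control flow, paraphrasing the hunk above:

// Before: the result type depended on what DevicePut() produced.
if (put.ifrt_array) {
  return PyArray::MakeFromSingleDeviceArray(
      std::move(client), Traceback::Get(), std::move(put.ifrt_array),
      /*weak_type=*/false, /*committed=*/false);
} else {
  return put.owning_pybuffer;  // hand back the owning Python buffer as-is
}

// After: exactly one result shape, with the sharding known at wrap time.
return PyArray::MakeFromIfrtArrayAndSharding(
    std::move(client), Traceback::Get(),
    std::move(device_put_result.ifrt_array), std::move(sharding),
    /*weak_type=*/false, /*committed=*/false, /*skip_checks=*/true);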
