ml-explore
diff --git a/.circleci/config.yml b/.circleci/config.yml
Lines changed: 58 additions & 40 deletions
Lines changed: 58 additions & 40 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 10 additions & 0 deletions
Lines changed: 10 additions & 0 deletions
diff --git a/mlx/backend/cuda/CMakeLists.txt b/mlx/backend/cuda/CMakeLists.txt
Lines changed: 9 additions & 7 deletions
Lines changed: 9 additions & 7 deletions
diff --git a/mlx/backend/cuda/arange.cu b/mlx/backend/cuda/arange.cu
Lines changed: 55 additions & 0 deletions
Lines changed: 55 additions & 0 deletions
diff --git a/mlx/backend/cuda/arg_reduce.cu b/mlx/backend/cuda/arg_reduce.cu
Lines changed: 12 additions & 7 deletions
Lines changed: 12 additions & 7 deletions
@@ -81,30 +81,32 @@ jobs:
             export DEBIAN_FRONTEND=noninteractive
             export NEEDRESTART_MODE=a
             sudo apt-get update
-            sudo apt-get upgrade -y
-            pip install --upgrade cmake
             sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
             sudo apt-get install openmpi-bin openmpi-common libopenmpi-dev
+            curl -LsSf https://astral.sh/uv/install.sh | sh
       - run:
           name: Install Python package
           command: |
-            pip install -e ".[dev]"
+            uv venv
+            uv pip install cmake
+            uv pip install -e ".[dev]" -v
       - run:
           name: Generate package stubs
           command: |
-            echo "stubs"
-            pip install typing_extensions
-            python setup.py generate_stubs
+            uv pip install typing_extensions
+            uv run --no-project setup.py generate_stubs
       - run:
           name: Run Python tests
           command: |
+            source .venv/bin/activate
             python -m unittest discover python/tests -v
             mpirun --bind-to none -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
             mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
             if $(grep "\[WARN\]" stderr.log); then echo "Distributed ring test failed"; exit 1; fi
       - run:
           name: Build CPP only
           command: |
+            source .venv/bin/activate
             mkdir -p build && cd build
             cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
             make -j `nproc`
@@ -130,33 +132,30 @@ jobs:
       - run:
           name: Install dependencies
           command: |
-            brew install python@3.9
-            brew install openmpi
-            python3.9 -m venv env
-            source env/bin/activate
-            pip install --upgrade pip
-            pip install --upgrade cmake
-            pip install nanobind==2.4.0
-            pip install numpy
-            pip install torch
-            pip install tensorflow
-            pip install unittest-xml-reporting
+            HOMEBREW_NO_AUTO_UPDATE=1 HOMEBREW_NO_INSTALL_CLEANUP=1 \
+              brew install openmpi uv
       - run:
           name: Install Python package
           command: |
-            source env/bin/activate
+            uv venv --python 3.9
+            uv pip install \
+              nanobind==2.4.0 \
+              cmake \
+              numpy \
+              torch \
+              tensorflow \
+              unittest-xml-reporting
             DEBUG=1 CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON" \
-              pip install -e . -v
+              uv pip install -e . -v
       - run:
           name: Generate package stubs
           command: |
-            source env/bin/activate
-            pip install typing_extensions
-            python setup.py generate_stubs
+            uv pip install typing_extensions
+            uv run --no-project setup.py generate_stubs
       - run:
           name: Run Python tests
           command: |
-            source env/bin/activate
+            source .venv/bin/activate
             LOW_MEMORY=1 DEVICE=cpu python -m xmlrunner discover -v python/tests -o test-results/cpu
             LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m xmlrunner discover -v python/tests -o test-results/gpu
             mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
@@ -165,16 +164,15 @@ jobs:
       - run:
           name: Build example extension
           command: |
-            source env/bin/activate
             cd examples/extensions
-            pip install -r requirements.txt
-            python setup.py build_ext -j8
+            uv pip install -r requirements.txt
+            uv run --no-project setup.py build_ext -j8
       - store_test_results:
           path: test-results
       - run:
           name: Build CPP only
           command: |
-            source env/bin/activate
+            source .venv/bin/activate
             mkdir -p build && cd build && cmake .. && make -j `sysctl -n hw.ncpu`
       - run:
           name: Run CPP tests
@@ -183,7 +181,7 @@ jobs:
       - run:
           name: Build small binary
           command: |
-            source env/bin/activate
+            source .venv/bin/activate
             cd build/
             cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel \
               -DBUILD_SHARED_LIBS=ON \
@@ -195,12 +193,13 @@ jobs:
       - run:
           name: Run Python tests with JIT
           command: |
-            source env/bin/activate
             CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
-              pip install -e . -v
+              uv pip install -e .
             LOW_MEMORY=1 DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 \
               METAL_DEBUG_ERROR_MODE=0 \
-              python -m xmlrunner discover -v python/tests -o test-results/gpu_jit
+              uv run --no-project python -m xmlrunner discover \
+                -v python/tests \
+                -o test-results/gpu_jit
 
   cuda_build_and_test:
     parameters:
@@ -212,22 +211,42 @@ jobs:
       resource_class: gpu.nvidia.small.gen2
     steps:
       - checkout
+      - restore_cache:
+          keys:
+            - cuda-<< parameters.image_date >>-{{ arch }}-
       - run:
-          name: Install Python package
+          name: Install dependencies
           command: |
             sudo apt-get update
             sudo apt-get install libcudnn9-dev-cuda-12
             sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
-            python3 -m venv env
-            source env/bin/activate
+            curl -sL https://github.com/ccache/ccache/releases/download/v4.11.3/ccache-4.11.3-linux-x86_64.tar.xz | tar xJf -
+            sudo mv ccache-4.11.3-linux-x86_64/ccache /usr/bin/ccache
+            rm -rf ccache-4.11.3-linux-x86_64
+            curl -LsSf https://astral.sh/uv/install.sh | sh
+      - run:
+          name: Install Python package
+          command: |
+            uv venv
             CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
-              pip install -e ".[dev]"
+              uv pip install -e ".[dev]" -v
       - run:
           name: Run Python tests
           command: |
-            source env/bin/activate
+            source .venv/bin/activate
             LOW_MEMORY=1 DEVICE=cpu python -m unittest discover python/tests -v
             LOW_MEMORY=1 DEVICE=gpu python -m tests discover python/tests -v
+      - run:
+          name: CCache report
+          command: |
+            ccache --show-stats
+            ccache --zero-stats
+            ccache --max-size 400MB
+            ccache --cleanup
+      - save_cache:
+          key: cuda-<< parameters.image_date >>-{{ arch }}-{{ epoch }}
+          paths:
+            - /home/circleci/.cache/ccache
 
   build_release:
     parameters:
@@ -323,14 +342,10 @@ jobs:
             export DEBIAN_FRONTEND=noninteractive
             export NEEDRESTART_MODE=a
             sudo apt-get update
-            sudo apt-get upgrade -y
             TZ=Etc/UTC sudo apt-get -y install tzdata
-            sudo apt-get install -y apt-utils
-            sudo apt-get install -y software-properties-common
             sudo add-apt-repository -y ppa:deadsnakes/ppa
             sudo apt-get install -y $PYTHON $PYTHON-dev $PYTHON-full
             sudo apt-get install -y libblas-dev liblapack-dev liblapacke-dev
-            sudo apt-get install -y build-essential git
             $PYTHON -m venv env
             source env/bin/activate
             pip install --upgrade pip
@@ -555,6 +570,9 @@ workflows:
           requires: [ hold ]
       - cuda_build_and_test:
           requires: [ hold ]
+          matrix:
+            parameters:
+              image_date: ["2023.11.1", "2025.05.1"]
   nightly_build:
     when:
       and:
 
@@ -41,6 +41,7 @@ option(MLX_BUILD_GGUF "Include support for GGUF format" ON)
 option(MLX_BUILD_SAFETENSORS "Include support for safetensors format" ON)
 option(MLX_BUILD_BLAS_FROM_SOURCE "Build OpenBLAS from source code" OFF)
 option(MLX_METAL_JIT "Use JIT compilation for Metal kernels" OFF)
+option(MLX_USE_CCACHE "Use CCache for compilation cache when available" ON)
 option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF)
 
 # --------------------- Processor tests -------------------------
@@ -68,6 +69,15 @@ else()
   set(MLX_BUILD_METAL OFF)
 endif()
 
+if(MLX_USE_CCACHE) # Only look for ccache when the option is enabled.
+  find_program(CCACHE_PROGRAM ccache) # Sets CCACHE_PROGRAM when a ccache executable is found.
+  if(CCACHE_PROGRAM) # No launcher is configured when ccache is not found.
+    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") # Run C compiles through ccache.
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") # Same for C++ compiles.
+    set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") # Same for CUDA compiles.
+  endif()
+endif()
+
 # ----------------------------- Lib -----------------------------
 
 include(FetchContent)
 
@@ -6,6 +6,7 @@
 target_sources(
   mlx
   PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
+          ${CMAKE_CURRENT_SOURCE_DIR}/arange.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/binary.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/binary_two.cu
@@ -29,7 +30,7 @@ target_sources(
           ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/layer_norm.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/logsumexp.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/random.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/reduce/all_reduce.cu
@@ -45,7 +46,8 @@ target_sources(
           ${CMAKE_CURRENT_SOURCE_DIR}/ternary.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/unary.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
-          ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/affine_quantize.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/quantized.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)
 
 if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9.0)
@@ -105,11 +107,11 @@ if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0)
     mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--compress-mode=size>")
 endif()
 
-# Compute capability 7 is required for synchronization between CPU/GPU with
-# managed memory. TODO: Add more architectures for potential performance gain.
-set(MLX_CUDA_ARCHITECTURES
-    "70;80"
-    CACHE STRING "CUDA architectures")
+# Compute capability >= 7.0 is required for synchronization between CPU/GPU with
+# managed memory.
+if(NOT DEFINED MLX_CUDA_ARCHITECTURES)
+  set(MLX_CUDA_ARCHITECTURES "native")
+endif()
 message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}")
 set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES
                                      "${MLX_CUDA_ARCHITECTURES}")
 
@@ -0,0 +1,55 @@
+// Copyright © 2025 Apple Inc.
+
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/device/fp16_math.cuh"
+#include "mlx/backend/cuda/kernel_utils.cuh"
+#include "mlx/dtype_utils.h"
+#include "mlx/primitives.h"
+
+#include <nvtx3/nvtx3.hpp>
+#include <thrust/device_ptr.h>
+#include <thrust/transform.h>
+
+namespace mlx::core {
+
+namespace cu {
+
+template <typename T> // T: element type the functor produces.
+struct Arange { // Functor mapping an index i to start + i * step.
+  const T start; // First term of the sequence.
+  const T step; // Increment between consecutive terms.
+
+  __device__ T operator()(uint32_t i) const { // Device-only call operator.
+    return start + i * step; // i-th term, evaluated in T's arithmetic.
+  }
+};
+
+} // namespace cu
+
+void Arange::eval_gpu(const std::vector<array>& inputs, array& out) { // Fills `out` with start_ + i * step; `inputs` is not read here.
+  nvtx3::scoped_range r("Arange::eval_gpu"); // NVTX range labelling this call.
+  if (out.size() == 0) { // Empty output: return before allocating anything.
+    return;
+  }
+  out.set_data(allocator::malloc(out.nbytes())); // Allocate out.nbytes() bytes and attach them to `out`.
+
+  auto& encoder = cu::get_command_encoder(stream()); // Encoder for this primitive's stream.
+  encoder.set_output_array(out); // Tell the encoder that `out` is written by this op.
+
+  auto capture = encoder.capture_context(); // NOTE(review): presumably scopes capture of the work issued below; confirm against the encoder API.
+  dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) { // Invoke the lambda with a tag for out's dtype.
+    using CTYPE = MLX_GET_TYPE(type_tag); // Type obtained from the dispatch tag.
+    using OutType = cuda_type_t<CTYPE>; // Corresponding type used on the CUDA side.
+    CTYPE step = // Difference of two values already cast to CTYPE, not a direct cast of step_.
+        static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_);
+    thrust::transform( // Writes Arange(i) for each i in [0, out.data_size()).
+        cu::thrust_policy(encoder.stream()), // Execution policy built from the encoder's stream.
+        thrust::counting_iterator<uint32_t>(0), // Index range begin.
+        thrust::counting_iterator<uint32_t>(out.data_size()), // Index range end (exclusive).
+        thrust::device_pointer_cast(out.data<OutType>()), // Destination: out's data as OutType.
+        cu::Arange<OutType>{
+            static_cast<OutType>(start_), static_cast<OutType>(step)}); // Functor initialised with start and the precomputed step.
+  });
+}
+
+} // namespace mlx::core
@@ -44,8 +44,11 @@ struct ArgMin {
   }
 
   template <int N>
-  __device__ IndexValPair<T>
-  reduce_many(IndexValPair<T> best, T (&vals)[N], uint32_t offset) {
+  __device__ IndexValPair<T> reduce_many(
+      IndexValPair<T> best,
+      const AlignedVector<T, N>& vals,
+      uint32_t offset) {
+#pragma unroll
     for (int i = 0; i < N; i++) {
       if (vals[i] < best.val) {
         best.val = vals[i];
@@ -74,8 +77,11 @@ struct ArgMax {
   }
 
   template <int N>
-  __device__ IndexValPair<T>
-  reduce_many(IndexValPair<T> best, T (&vals)[N], uint32_t offset) {
+  __device__ IndexValPair<T> reduce_many(
+      IndexValPair<T> best,
+      const AlignedVector<T, N>& vals,
+      uint32_t offset) {
+#pragma unroll
     for (int i = 0; i < N; i++) {
       if (vals[i] > best.val) {
         best.val = vals[i];
@@ -106,16 +112,15 @@ __global__ void arg_reduce_general(
 
   int64_t in_idx = elem_to_loc(index, shape.data(), in_strides.data(), ndim);
   int64_t out_idx = elem_to_loc(index, shape.data(), out_strides.data(), ndim);
+  in += in_idx;
 
   Op op;
   T init = op.init();
   IndexValPair<T> best{0, init};
 
   for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
-    T vals[N_READS];
     auto tid = r * BLOCK_DIM + block.thread_index().x;
-    cub::LoadDirectBlocked(
-        tid, StridedIterator(in + in_idx, axis_stride), vals, axis_size, init);
+    auto vals = load_vector<N_READS>(in, tid, axis_size, axis_stride, init);
     best = op.reduce_many(best, vals, tid * N_READS);
   }