<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <title>Private AI on Lean Deep Tech blog</title>
    <link>https://leandeep.com/tags/private-ai/</link>
    <description>Recent content in Private AI on Lean Deep Tech blog</description>
    <generator>Hugo</generator>
    <language>en</language>
    <copyright>&lt;a href=&#34;https://creativecommons.org/licenses/by-nc/4.0/&#34; target=&#34;_blank&#34; rel=&#34;noopener&#34;&gt;CC BY-NC 4.0&lt;/a&gt;</copyright>
    <lastBuildDate>Sun, 26 Apr 2026 12:15:00 +0000</lastBuildDate>
    <atom:link href="https://leandeep.com/tags/private-ai/index.xml" rel="self" type="application/rss+xml" />
    <item>
      <title>Private AI with Gemma 4</title>
      <link>https://leandeep.com/private-ai-with-gemma-4/</link>
      <pubDate>Sun, 26 Apr 2026 12:15:00 +0000</pubDate>
      <guid>https://leandeep.com/private-ai-with-gemma-4/</guid>
      <description>&lt;h2 id=&#34;option-1&#34;&gt;Option 1&lt;/h2&gt;&#xA;&lt;p&gt;&lt;strong&gt;Prerequisites&lt;/strong&gt;&lt;/p&gt;&#xA;&lt;ul&gt;&#xA;&lt;li&gt;Pyenv&lt;/li&gt;&#xA;&lt;li&gt;CUDA 13&lt;/li&gt;&#xA;&lt;/ul&gt;&#xA;&lt;br/&gt;&#xA;&lt;p&gt;&lt;strong&gt;Installation&lt;/strong&gt;&lt;/p&gt;&#xA;&lt;pre tabindex=&#34;0&#34;&gt;&lt;code&gt;pyenv install 3.12.2&#xA;pyenv global 3.12.2&#xA;python --version&#xA;&lt;/code&gt;&lt;/pre&gt;&lt;br/&gt;&#xA;&lt;pre tabindex=&#34;0&#34;&gt;&lt;code&gt;python -m venv ./vllm-gemma&#xA;source ./vllm-gemma/bin/activate&#xA;&lt;/code&gt;&lt;/pre&gt;&lt;br/&gt;&#xA;&lt;pre tabindex=&#34;0&#34;&gt;&lt;code&gt;pip install --upgrade pip&#xA;&#xA;pip install torch --index-url https://download.pytorch.org/whl/cu130&#xA;python -c &amp;#34;import torch; print(torch.cuda.is_available())&amp;#34;&#xA;&lt;/code&gt;&lt;/pre&gt;&lt;br/&gt;&#xA;&lt;pre tabindex=&#34;0&#34;&gt;&lt;code&gt;pip install vllm&#xA;&lt;/code&gt;&lt;/pre&gt;&lt;br/&gt;&#xA;&lt;pre tabindex=&#34;0&#34;&gt;&lt;code&gt;pip install huggingface_hub&#xA;hf download google/gemma-4-26B-A4B-it --local-dir ~/hf_models/gemma-4-26B-A4B-it&#xA;&#xA;vllm serve google/gemma-4-26B-A4B-it --max-model-len 32768 --gpu-memory-utilization 0.90 --limit-mm-per-prompt &amp;#39;{&amp;#34;image&amp;#34;: 150, &amp;#34;video&amp;#34;: 1, &amp;#34;audio&amp;#34;: 0}&amp;#39; --enable-prefix-caching --host 0.0.0.0 --port 8000&#xA;&lt;/code&gt;&lt;/pre&gt;&lt;br/&gt;&#xA;
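&lt;p&gt;Once the server is up, a quick sanity check of the OpenAI-compatible API looks like this (a minimal sketch; the model name must match the one passed to vllm serve):&lt;/p&gt;&#xA;&lt;pre tabindex=&#34;0&#34;&gt;&lt;code&gt;# List the served models&#xA;curl http://localhost:8000/v1/models&#xA;&#xA;# Minimal chat completion&#xA;curl http://localhost:8000/v1/chat/completions \&#xA;  -H &amp;#34;Content-Type: application/json&amp;#34; \&#xA;  -d &amp;#39;{&amp;#34;model&amp;#34;: &amp;#34;google/gemma-4-26B-A4B-it&amp;#34;, &amp;#34;messages&amp;#34;: [{&amp;#34;role&amp;#34;: &amp;#34;user&amp;#34;, &amp;#34;content&amp;#34;: &amp;#34;Say hello.&amp;#34;}]}&amp;#39;&#xA;&lt;/code&gt;&lt;/pre&gt;&lt;br/&gt;&#xA;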
&lt;h2 id=&#34;option-2---docker&#34;&gt;Option 2 - Docker&lt;/h2&gt;&#xA;&lt;pre tabindex=&#34;0&#34;&gt;&lt;code&gt;# Base CUDA 13 (ARM sbsa)&#xA;FROM nvidia/cuda:13.0.0-devel-ubuntu22.04&#xA;&#xA;ENV DEBIAN_FRONTEND=noninteractive&#xA;&#xA;ENV USE_CUDA=1&#xA;ENV USE_NCCL=1&#xA;ENV USE_DISTRIBUTED=1&#xA;&#xA;ENV VLLM_USE_FLASH_ATTENTION=0&#xA;&#xA;# --------------------------------------------------&#xA;# System deps&#xA;# --------------------------------------------------&#xA;RUN apt-get update &amp;amp;&amp;amp; apt-get install -y \&#xA;    build-essential \&#xA;    git \&#xA;    cmake \&#xA;    ninja-build \&#xA;    curl \&#xA;    wget \&#xA;    vim \&#xA;    python3 \&#xA;    python3-dev \&#xA;    python3-pip \&#xA;    python3-venv \&#xA;    libopenblas-dev \&#xA;    libssl-dev \&#xA;    zlib1g-dev \&#xA;    libffi-dev \&#xA;    libbz2-dev \&#xA;    libreadline-dev \&#xA;    libsqlite3-dev \&#xA;    libncursesw5-dev \&#xA;    xz-utils \&#xA;    tk-dev \&#xA;    libxml2-dev \&#xA;    libxmlsec1-dev \&#xA;    liblzma-dev \&#xA;    gcc-12 \&#xA;    g++-12 \&#xA;    &amp;amp;&amp;amp; rm -rf /var/lib/apt/lists/*&#xA;&#xA;# Set python&#xA;RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1&#xA;&#xA;RUN python -m pip install --upgrade pip setuptools wheel&#xA;&#xA;# --------------------------------------------------&#xA;# CUDA env&#xA;# --------------------------------------------------&#xA;ENV CUDA_HOME=/usr/local/cuda&#xA;ENV LD_LIBRARY_PATH=/usr/local/cuda/targets/sbsa-linux/lib:$LD_LIBRARY_PATH&#xA;ENV PATH=/usr/local/cuda/bin:$PATH&#xA;&#xA;# --------------------------------------------------&#xA;# Compilers&#xA;# --------------------------------------------------&#xA;ENV CC=gcc-12&#xA;ENV CXX=g++-12&#xA;&#xA;# --------------------------------------------------&#xA;# Build PyTorch from source&#xA;# --------------------------------------------------&#xA;WORKDIR /opt&#xA;&#xA;RUN git clone --recursive https://github.com/pytorch/pytorch&#xA;&#xA;WORKDIR /opt/pytorch&#xA;&#xA;# CUDA arch list (adjust to your actual GPU)&#xA;ENV TORCH_CUDA_ARCH_LIST=&amp;#34;8.0;8.6;8.9;9.0&amp;#34;&#xA;&#xA;# Fix ARM SVE issue&#xA;ENV BUILD_IGNORE_SVE_UNAVAILABLE=1&#xA;&#xA;# Optional but recommended (avoids other CPU warnings)&#xA;ENV USE_NATIVE_ARCH=0&#xA;&#xA;# Install Python deps&#xA;RUN python -m pip install -r requirements.txt&#xA;&#xA;# Build PyTorch&#xA;RUN python setup.py develop&#xA;&#xA;# --------------------------------------------------&#xA;# Install Triton&#xA;# --------------------------------------------------&#xA;RUN python -m pip install ninja cmake&#xA;&#xA;# Try wheel first&#xA;RUN python -m pip install triton || true&#xA;&#xA;# Fallback: build Triton from source&#xA;RUN if ! python -c &amp;#34;import triton&amp;#34; ; then \&#xA;      git clone https://github.com/openai/triton /opt/triton &amp;amp;&amp;amp; \&#xA;      cd /opt/triton/python &amp;amp;&amp;amp; \&#xA;      python -m pip install -e . ; \&#xA;    fi&#xA;&#xA;# --------------------------------------------------&#xA;# Build vLLM&#xA;# --------------------------------------------------&#xA;WORKDIR /opt&#xA;&#xA;RUN git clone https://github.com/vllm-project/vllm&#xA;&#xA;WORKDIR /opt/vllm&#xA;&#xA;ENV VLLM_BUILD_WITH_CUDA=1&#xA;ENV MAX_JOBS=8&#xA;&#xA;RUN python -m pip install -e .&#xA;&#xA;# --------------------------------------------------&#xA;# Runtime&#xA;# --------------------------------------------------&#xA;WORKDIR /workspace&#xA;&#xA;EXPOSE 8000&#xA;&#xA;CMD [&amp;#34;python&amp;#34;, &amp;#34;-m&amp;#34;, &amp;#34;vllm.entrypoints.openai.api_server&amp;#34;, \&#xA;     &amp;#34;--model&amp;#34;, &amp;#34;/models/gemma&amp;#34;, \&#xA;     &amp;#34;--trust-remote-code&amp;#34;, \&#xA;     &amp;#34;--max-model-len&amp;#34;, &amp;#34;8192&amp;#34;, \&#xA;     &amp;#34;--max-num-batched-tokens&amp;#34;, &amp;#34;4096&amp;#34;, \&#xA;     &amp;#34;--gpu-memory-utilization&amp;#34;, &amp;#34;0.90&amp;#34;, \&#xA;     &amp;#34;--disable-flash-attn&amp;#34;, \&#xA;     &amp;#34;--enforce-eager&amp;#34;, \&#xA;     &amp;#34;--host&amp;#34;, &amp;#34;0.0.0.0&amp;#34;, \&#xA;     &amp;#34;--port&amp;#34;, &amp;#34;8000&amp;#34;]&#xA;&lt;/code&gt;&lt;/pre&gt;&lt;br/&gt;&#xA;&lt;pre tabindex=&#34;0&#34;&gt;&lt;code&gt;docker build -t vllm-cuda13 .&#xA;docker run --gpus all -p 8000:8000 \&#xA;  -v ~/hf_models/gemma-4-26B-A4B-it:/models/gemma \&#xA;  vllm-cuda13&#xA;&lt;/code&gt;&lt;/pre&gt;&lt;br/&gt;&#xA;&lt;p&gt;If you see something like this, it means everything is working:&lt;/p&gt;
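&lt;p&gt;You can also verify readiness from another shell. A minimal sketch, assuming the container started above is still running:&lt;/p&gt;&#xA;&lt;pre tabindex=&#34;0&#34;&gt;&lt;code&gt;# vLLM answers on /health with HTTP 200 once the engine is ready&#xA;curl -i http://localhost:8000/health&#xA;&#xA;# Follow the logs of the most recently started container&#xA;docker logs -f &amp;#34;$(docker ps -lq)&amp;#34;&#xA;&lt;/code&gt;&lt;/pre&gt;</description>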
    </item>
    <item>
      <title>LM Studio on Spark</title>
      <link>https://leandeep.com/llm-studio-on-spark/</link>
      <pubDate>Sun, 26 Apr 2026 07:15:00 +0000</pubDate>
      <guid>https://leandeep.com/llm-studio-on-spark/</guid>
      <description>&lt;h2 id=&#34;installation&#34;&gt;Installation&lt;/h2&gt;&#xA;&lt;pre tabindex=&#34;0&#34;&gt;&lt;code&gt;curl -fsSL https://lmstudio.ai/install.sh | bash&#xA;&#xA;curl -L -O https://raw.githubusercontent.com/lmstudio-ai/docs/main/_assets/nvidia-spark-playbook/bash/run.sh&#xA;# Or with Python&#xA;curl -L -O https://raw.githubusercontent.com/lmstudio-ai/docs/main/_assets/nvidia-spark-playbook/py/run.py&#xA;&lt;/code&gt;&lt;/pre&gt;&lt;br/&gt;&#xA;&lt;h2 id=&#34;test&#34;&gt;Test&lt;/h2&gt;&#xA;&lt;pre tabindex=&#34;0&#34;&gt;&lt;code&gt;lms server start --bind 0.0.0.0 --port 1234&#xA;&#xA;hostname -I&#xA;# Replace server_ip with one of the addresses printed above&#xA;curl http://server_ip:1234/api/v1/models&#xA;&lt;/code&gt;&lt;/pre&gt;&lt;br/&gt;&#xA;&lt;h2 id=&#34;download-model&#34;&gt;Download model&lt;/h2&gt;&#xA;&lt;pre tabindex=&#34;0&#34;&gt;&lt;code&gt;lms get openai/gpt-oss-120b&#xA;&lt;/code&gt;&lt;/pre&gt;&lt;br/&gt;&#xA;&lt;pre tabindex=&#34;0&#34;&gt;&lt;code&gt;# Run the playbook script downloaded during installation&#xA;curl -LsSf https://astral.sh/uv/install.sh | sh&#xA;uv run --script run.py&#xA;&lt;/code&gt;&lt;/pre&gt;
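&lt;br/&gt;&#xA;&lt;p&gt;With the model pulled, a minimal smoke test against the OpenAI-compatible endpoint looks like this (a sketch; the model identifier is assumed to match what lms get reported):&lt;/p&gt;&#xA;&lt;pre tabindex=&#34;0&#34;&gt;&lt;code&gt;curl http://server_ip:1234/v1/chat/completions \&#xA;  -H &amp;#34;Content-Type: application/json&amp;#34; \&#xA;  -d &amp;#39;{&amp;#34;model&amp;#34;: &amp;#34;openai/gpt-oss-120b&amp;#34;, &amp;#34;messages&amp;#34;: [{&amp;#34;role&amp;#34;: &amp;#34;user&amp;#34;, &amp;#34;content&amp;#34;: &amp;#34;Say hello.&amp;#34;}]}&amp;#39;&#xA;&lt;/code&gt;&lt;/pre&gt;</description>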
    </item>
  </channel>
</rss>
