initial minimal xla support

radna0 2024-11-17 22:45:50 +00:00
parent d9f90965c8
commit e5182dd427
4 changed files with 65 additions and 2 deletions

View File

@@ -200,6 +200,23 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve
```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```
#### TPU/XLA Devices
Users with TPU/XLA devices can install the stable PyTorch/XLA build with the following command:
```pip install torch~=2.5.0 torch_xla[tpu]~=2.5.0 -f https://storage.googleapis.com/libtpu-releases/index.html```
To install the 2.6.0 nightly build instead, which may include some performance improvements, run:
```
pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
pip install 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl' -f https://storage.googleapis.com/libtpu-releases/index.html
```
To get memory info for TPU devices, install the [tpu-info](https://github.com/AI-Hypercomputer/cloud-accelerator-diagnostics/tree/main/tpu_info) package with the following command:
```pip install tpu-info```
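Before launching ComfyUI, you can confirm that PyTorch/XLA actually sees the accelerator with a quick one-liner (illustrative; assumes the torch_xla install above and prints a device such as `xla:0`):
```python -c "import torch_xla; print(torch_xla.device())"```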
# Running
```python main.py```

View File

@@ -138,6 +138,8 @@ parser.add_argument("--multi-user", action="store_true", help="Enables per-user
parser.add_argument("--verbose", default='INFO', const='DEBUG', nargs="?", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Set the logging level')
parser.add_argument("--xla", action="store_true", help="To use the XLA devices for everything.")
# The default built-in provider hosted under web/
DEFAULT_VERSION_STRING = "comfyanonymous/ComfyUI@latest"
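With this flag in place, running ComfyUI on a TPU VM amounts to passing it at launch (assuming the PyTorch/XLA setup described in the README):
```python main.py --xla```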

View File

@@ -36,6 +36,7 @@ class CPUState(Enum):
GPU = 0
CPU = 1
MPS = 2
XLA = 3
# Determine VRAM State
vram_state = VRAMState.NORMAL_VRAM
@@ -84,9 +85,28 @@ try:
except:
pass
try:
if args.xla:
import torch_xla as xla
import torch_xla.core.xla_model as xm
cpu_state = CPUState.XLA
except ImportError:
logging.error("XLA not available, please install pytorch-xla")
pass
if args.cpu:
cpu_state = CPUState.CPU
def get_xla_memory_info(dev):
# xm.get_memory_info(dev) only has bytes_limit and bytes_used
mem_info = xm.get_memory_info(dev)
mem_reserved = mem_info["bytes_used"]
mem_total = mem_info["bytes_limit"]
return (mem_reserved, mem_total)
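For context, `xm.get_memory_info(dev)` returns a plain dict keyed by `bytes_used` and `bytes_limit`, which the helper above maps onto the (reserved, total) pair the rest of model_management expects. A minimal way to exercise it on a TPU VM (an illustrative sketch, assuming torch_xla 2.5+ is installed):
```
import torch_xla as xla
import torch_xla.core.xla_model as xm

dev = xla.device()
info = xm.get_memory_info(dev)  # e.g. {'bytes_used': ..., 'bytes_limit': ...}
print(f"{info['bytes_used'] / 2**20:.0f} MiB used of {info['bytes_limit'] / 2**20:.0f} MiB total")
```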
def is_intel_xpu():
global cpu_state
global xpu_available
@@ -105,6 +125,8 @@ def get_torch_device():
return torch.device("mps")
if cpu_state == CPUState.CPU:
return torch.device("cpu")
if cpu_state == CPUState.XLA:
return xla.device()
else:
if is_intel_xpu():
return torch.device("xpu", torch.xpu.current_device())
@@ -128,6 +150,8 @@ def get_total_memory(dev=None, torch_total_too=False):
mem_reserved = stats['reserved_bytes.all.current']
mem_total_torch = mem_reserved
mem_total = torch.xpu.get_device_properties(dev).total_memory
elif cpu_state == CPUState.XLA:
mem_total_torch, mem_total = get_xla_memory_info(dev)
else:
stats = torch.cuda.memory_stats(dev)
mem_reserved = stats['reserved_bytes.all.current']
@@ -241,7 +265,7 @@ if lowvram_available:
vram_state = set_vram_to
if cpu_state != CPUState.GPU:
if cpu_state != CPUState.GPU and cpu_state != CPUState.XLA:
vram_state = VRAMState.DISABLED
if cpu_state == CPUState.MPS:
@@ -924,6 +948,10 @@ def get_free_memory(dev=None, torch_free_too=False):
mem_free_torch = mem_reserved - mem_active
mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved
mem_free_total = mem_free_xpu + mem_free_torch
elif cpu_state == CPUState.XLA:
mem_reserved, mem_total = get_xla_memory_info(dev)
mem_free_total = mem_total - mem_reserved
mem_free_torch = mem_free_total
else:
stats = torch.cuda.memory_stats(dev)
mem_active = stats['active_bytes.all.current']
@@ -937,6 +965,10 @@ def get_free_memory(dev=None, torch_free_too=False):
else:
return mem_free_total
def xla_mode():
global cpu_state
return cpu_state == CPUState.XLA
def cpu_mode():
global cpu_state
return cpu_state == CPUState.CPU
@@ -985,6 +1017,9 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
if cpu_mode():
return False
if xla_mode():
return True
if is_intel_xpu():
return True
@@ -1044,6 +1079,9 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
if cpu_mode():
return False
if xla_mode():
return True
if is_intel_xpu():
return True
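Preferring bf16 under XLA matches the hardware: TPU matrix units operate natively on bfloat16, so it is the natural reduced-precision dtype there. A minimal illustration (assuming a TPU VM with torch_xla installed):
```
import torch
import torch_xla as xla

dev = xla.device()
x = torch.randn(4, 4, device=dev, dtype=torch.bfloat16)
print(x.dtype, x.device)  # e.g. torch.bfloat16 xla:0
```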
@@ -1062,6 +1100,9 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
return False
def supports_fp8_compute(device=None):
if xla_mode():
return False
if not is_nvidia():
return False
@@ -1101,7 +1142,7 @@ def resolve_lowvram_weight(weight, model, key): #TODO: remove
print("WARNING: The comfy.model_management.resolve_lowvram_weight function will be removed soon, please stop using it.")
return weight
#TODO: might be cleaner to put this somewhere else
# TODO: might be cleaner to put this somewhere else
import threading
class InterruptProcessingException(Exception):

View File

@@ -102,6 +102,9 @@ def prepare_callback(model, steps, x0_output_dict=None):
preview_bytes = None
if previewer:
preview_bytes = previewer.decode_latent_to_preview_image(preview_format, x0)
if args.xla:
import torch_xla as xla
xla.sync()
pbar.update_absolute(step + 1, total_steps, preview_bytes)
return callback
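For background, PyTorch/XLA executes lazily: operations on XLA tensors are recorded into a graph and only run when the step is flushed, so the `xla.sync()` call above forces the pending work for the current sampling step to execute before the progress bar is updated. A minimal sketch of the pattern (assuming torch_xla 2.5+, where `torch_xla.sync()` is available):
```
import torch
import torch_xla as xla

dev = xla.device()
x = torch.randn(256, 256, device=dev)
y = x @ x              # recorded in the lazy graph, not executed yet
xla.sync()             # compile and run the pending graph on the device
print(y.sum().item())  # materializing a value also forces execution
```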