Intel ARC/XPU Improvements #2052

Merged · 10 commits · Jul 23, 2023
2 changes: 2 additions & 0 deletions fastchat/model/compression.py
@@ -165,6 +165,8 @@ def load_compress_model(model_path, device, torch_dtype, use_fast, revision="main"):
             tensor = None
             gc.collect()
             torch.cuda.empty_cache()
+            if device == "xpu":
+                torch.xpu.empty_cache()
 
     for name in model.state_dict():
         if name not in linear_weights:
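The same two-line cleanup recurs below in fastchat/model/model_codet5p.py, fastchat/model/model_falcon.py, and fastchat/serve/inference.py. As a reading aid, here is a minimal sketch of that pattern factored into one helper; `empty_device_cache` is a hypothetical name, not part of this PR, and `torch.xpu` only exists once intel_extension_for_pytorch (IPEX) has been imported:

```python
import gc

import torch


def empty_device_cache(device: str) -> None:
    # Hypothetical helper mirroring the cleanup this PR repeats inline.
    gc.collect()
    # Safe even on machines without CUDA: empty_cache() is a no-op when
    # CUDA was never initialized.
    torch.cuda.empty_cache()
    if device == "xpu":
        # Provided by IPEX; releases cached XPU allocator blocks.
        torch.xpu.empty_cache()
```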
11 changes: 6 additions & 5 deletions fastchat/model/model_adapter.py
@@ -239,13 +239,14 @@ def load_model(
     adapter = get_model_adapter(model_path)
     model, tokenizer = adapter.load_model(model_path, kwargs)
 
-    if (device == "cuda" and num_gpus == 1 and not cpu_offloading) or device == "mps":
+    if (device == "cuda" and num_gpus == 1 and not cpu_offloading) or device in (
+        "mps",
+        "xpu",
+    ):
         model.to(device)
 
-    elif device == "xpu":
-        model.eval()
-        model = model.to("xpu")
-        model = torch.xpu.optimize(model, dtype=torch.bfloat16, inplace=True)
+    if device == "xpu":
+        model = torch.xpu.optimize(model, dtype=kwargs["torch_dtype"], inplace=True)
 
     if debug:
         print(model)
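For reference, a minimal sketch of the XPU load path this change converges on, assuming intel_extension_for_pytorch is installed (importing it registers the `torch.xpu` namespace); the function name below is illustrative, not FastChat API:

```python
import torch
import intel_extension_for_pytorch  # noqa: F401 -- registers torch.xpu


def prepare_model_for_xpu(model: torch.nn.Module, torch_dtype=torch.bfloat16):
    # Move to the Intel GPU, then let IPEX fuse operators and cast weights
    # to the requested dtype. The PR now passes kwargs["torch_dtype"] here
    # instead of hard-coding torch.bfloat16.
    model = model.to("xpu")
    model = torch.xpu.optimize(model, dtype=torch_dtype, inplace=True)
    return model
```

The behavioral fix is the dtype: the old branch always optimized for bfloat16, while the new code honors whatever dtype the caller requested. Folding the transfer into the shared `model.to(device)` branch also removes the duplicated XPU-only `model.to("xpu")`.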
2 changes: 2 additions & 0 deletions fastchat/model/model_codet5p.py
@@ -104,3 +104,5 @@ def __call__(
     # clean
     gc.collect()
     torch.cuda.empty_cache()
+    if device == "xpu":
+        torch.xpu.empty_cache()
2 changes: 2 additions & 0 deletions fastchat/model/model_falcon.py
@@ -136,3 +136,5 @@ def generate_stream_falcon(
     # clean
     gc.collect()
     torch.cuda.empty_cache()
+    if device == "xpu":
+        torch.xpu.empty_cache()
1 change: 1 addition & 0 deletions fastchat/serve/cli.py
@@ -177,6 +177,7 @@ def main(args):
                 f"Larger --num-gpus ({args.num_gpus}) than --gpus {args.gpus}!"
             )
         os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
+        os.environ["XPU_VISIBLE_DEVICES"] = args.gpus
 
     if args.style == "simple":
         chatio = SimpleChatIO(args.multiline)
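XPU_VISIBLE_DEVICES mirrors CUDA_VISIBLE_DEVICES here, scoping which Intel GPUs the process may use when --gpus is passed. A small usage sketch follows; it is an assumption, not something this diff verifies, that the installed IPEX/oneAPI runtime honors this variable, and like its CUDA counterpart it must be set before the device runtime initializes:

```python
import os

# Assumption: the installed IPEX runtime honors XPU_VISIBLE_DEVICES.
# Set it before importing torch/IPEX so device enumeration picks it up.
os.environ["XPU_VISIBLE_DEVICES"] = "0"

import torch
import intel_extension_for_pytorch  # noqa: F401 -- registers torch.xpu

print(torch.xpu.device_count())  # should reflect the restricted device set
```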
2 changes: 2 additions & 0 deletions fastchat/serve/inference.py
@@ -253,6 +253,8 @@ def generate_stream(
     del past_key_values, out
     gc.collect()
     torch.cuda.empty_cache()
+    if device == "xpu":
+        torch.xpu.empty_cache()
 
 
 class ChatIO(abc.ABC):