diff --git a/.github/workflows/glt-ci.yml b/.github/workflows/glt-ci.yml
index 2eb3fe25..4e94fbee 100644
--- a/.github/workflows/glt-ci.yml
+++ b/.github/workflows/glt-ci.yml
@@ -23,8 +23,8 @@ env:
 
 jobs:
   run-glt-unittests:
-    runs-on: self-hosted
-    if: ${{ github.repository == 'alibaba/graphlearn-for-pytorch' }}
+    runs-on: glt-gpu-instances
+    if: ${{ github.repository == 'snapchat/graphlearn-for-pytorch' }}
     steps:
     - name: Checkout Code
       uses: actions/checkout@v3
diff --git a/.github/workflows/glt-v6d-ci.yml b/.github/workflows/glt-v6d-ci.yml
index bad618f4..735cacc9 100644
--- a/.github/workflows/glt-v6d-ci.yml
+++ b/.github/workflows/glt-v6d-ci.yml
@@ -23,8 +23,8 @@ env:
 
 jobs:
   run-glt-v6d-unittests:
-    runs-on: self-hosted
-    if: ${{ github.repository == 'alibaba/graphlearn-for-pytorch' }}
+    runs-on: glt-gpu-instances
+    if: ${{ github.repository == 'snapchat/graphlearn-for-pytorch' }}
     steps:
     - name: Checkout Code
       uses: actions/checkout@v3
diff --git a/.github/workflows/manylinux-cd.yml b/.github/workflows/manylinux-cd.yml
index 099a7bf8..ca53f61d 100644
--- a/.github/workflows/manylinux-cd.yml
+++ b/.github/workflows/manylinux-cd.yml
@@ -12,8 +12,8 @@ env:
 
 jobs:
   build:
-    runs-on: self-hosted
-    if: ${{ github.repository == 'alibaba/graphlearn-for-pytorch' }}
+    runs-on: ubuntu-latest
+    if: ${{ github.repository == 'snapchat/graphlearn-for-pytorch' }}
     steps:
     - name: Checkout Code
      uses: actions/checkout@v3
diff --git a/test/python/dist_test_utils.py b/test/python/dist_test_utils.py
index cfb883e6..70cc3a1e 100644
--- a/test/python/dist_test_utils.py
+++ b/test/python/dist_test_utils.py
@@ -35,7 +35,7 @@
 
 # fixed sampling options
 sampling_nprocs = 2
-device_num = 2
+device_num = 1
 
 
 def _prepare_dataset(rank: int,
@@ -110,8 +110,7 @@ def _prepare_dataset(rank: int,
   weighted_graph = glt.data.Graph(weighted_csr_topo, 'CPU')
 
   # feature
-  device_group_list = [glt.data.DeviceGroup(0, [0]),
-                       glt.data.DeviceGroup(1, [1])]
+  device_group_list = [glt.data.DeviceGroup(0, [0])]
   split_ratio = 0.2
   nfeat = torch.tensor(nodes, dtype=torch.float32).unsqueeze(1).repeat(1, 512)
@@ -229,8 +228,7 @@ def _prepare_hetero_dataset(
   }
 
   # feature
-  device_group_list = [glt.data.DeviceGroup(0, [0]),
-                       glt.data.DeviceGroup(1, [1])]
+  device_group_list = [glt.data.DeviceGroup(0, [0])]
   split_ratio = 0.2
   user_nfeat = rank + torch.zeros(len(user_nodes), 512, dtype=torch.float32)
diff --git a/test/python/test_dist_feature.py b/test/python/test_dist_feature.py
index 32306dc6..d44bc2dd 100644
--- a/test/python/test_dist_feature.py
+++ b/test/python/test_dist_feature.py
@@ -27,7 +27,7 @@ def run_dist_feature_test(world_size: int, rank: int, feature: glt.data.Feature,
   partition2workers = glt.distributed.rpc_sync_data_partitions(world_size, rank)
   rpc_router = glt.distributed.RpcDataPartitionRouter(partition2workers)
 
-  current_device = torch.device('cuda', rank % 2)
+  current_device = torch.device('cuda', 0)
 
   dist_feature = glt.distributed.DistFeature(
     world_size, rank, feature, partition_book,
@@ -74,9 +74,15 @@ def test_dist_feature_lookup(self):
     ])
     partition_book.share_memory_()
 
+    # device_group_list = [
+    #   glt.data.DeviceGroup(0, [0]),
+    #   glt.data.DeviceGroup(1, [1])
+    # ]
+    # TODO(kmonte): Swap back to using real device_group_list when we have
+    # a way to run tests on multiple GPUs.
+    device_group_list = None
     device_group_list = [
       glt.data.DeviceGroup(0, [0]),
-      glt.data.DeviceGroup(1, [1])
     ]
 
     split_ratio = 0.8
diff --git a/test/python/test_dist_link_loader.py b/test/python/test_dist_link_loader.py
index d0a8fb92..3a751a3f 100644
--- a/test/python/test_dist_link_loader.py
+++ b/test/python/test_dist_link_loader.py
@@ -23,6 +23,9 @@
 from dist_test_utils import _prepare_dataset, _prepare_hetero_dataset
 from parameterized import parameterized
+
+device_num = 1
+
 
 
 def _check_sample_result(data, edge_dir='out'):
   tc = unittest.TestCase()
@@ -221,7 +224,7 @@ def run_test_as_worker(world_size: int, rank: int,
   else:
     worker_options = glt.distributed.MpDistSamplingWorkerOptions(
       num_workers=sampling_nprocs,
-      worker_devices=[torch.device('cuda', i % device_num)
+      worker_devices=[torch.device('cuda', 0)
                       for i in range(sampling_nprocs)],
      worker_concurrency=2,
       master_addr='localhost',
@@ -241,7 +244,7 @@ def run_test_as_worker(world_size: int, rank: int,
     with_edge=True,
     edge_dir=edge_dir,
     collect_features=True,
-    to_device=torch.device('cuda', rank % device_num),
+    to_device=torch.device('cuda', 0),
     worker_options=worker_options
   )
 
@@ -297,7 +300,7 @@ def run_test_as_client(num_servers: int, num_clients: int, client_rank: int,
   options = glt.distributed.RemoteDistSamplingWorkerOptions(
     server_rank=target_server_rank,
     num_workers=sampling_nprocs,
-    worker_devices=[torch.device('cuda', i % device_num)
+    worker_devices=[torch.device('cuda', 0)
                     for i in range(sampling_nprocs)],
     worker_concurrency=2,
     master_addr='localhost',
@@ -317,7 +320,7 @@ def run_test_as_client(num_servers: int, num_clients: int, client_rank: int,
     with_edge=True,
     edge_dir='out',
     collect_features=True,
-    to_device=torch.device('cuda', client_rank % device_num),
+    to_device=torch.device('cuda', 0),
     worker_options=options
   )
 
diff --git a/test/python/test_dist_neighbor_loader.py b/test/python/test_dist_neighbor_loader.py
index 480e693e..4219fed3 100644
--- a/test/python/test_dist_neighbor_loader.py
+++ b/test/python/test_dist_neighbor_loader.py
@@ -176,7 +176,7 @@ def run_test_as_worker(world_size: int, rank: int,
   else:
     worker_options = glt.distributed.MpDistSamplingWorkerOptions(
       num_workers=sampling_nprocs,
-      worker_devices=[torch.device('cuda', i % device_num)
+      worker_devices=[torch.device('cuda', 0)
                       for i in range(sampling_nprocs)],
       worker_concurrency=2,
       master_addr='localhost',
@@ -195,7 +195,7 @@ def run_test_as_worker(world_size: int, rank: int,
     with_edge=True,
     edge_dir=edge_dir,
     collect_features=True,
-    to_device=torch.device('cuda', rank % device_num),
+    to_device=torch.device('cuda', 0),
     worker_options=worker_options
   )
 
@@ -254,7 +254,7 @@ def run_test_as_client(num_servers: int, num_clients: int, client_rank: int, ser
     # Automatically assign server_rank (server_rank_list) if server_rank (server_rank_list) is None
     server_rank=server_rank,
     num_workers=sampling_nprocs,
-    worker_devices=[torch.device('cuda', i % device_num)
+    worker_devices=[torch.device('cuda', 0)
                     for i in range(sampling_nprocs)],
     worker_concurrency=2,
     master_addr='localhost',
@@ -274,7 +274,7 @@ def run_test_as_client(num_servers: int, num_clients: int, client_rank: int, ser
     with_edge=True,
     edge_dir=edge_dir,
     collect_features=True,
-    to_device=torch.device('cuda', client_rank % device_num),
+    to_device=torch.device('cuda', 0),
     worker_options=options
   )
 
diff --git a/test/python/test_dist_subgraph_loader.py b/test/python/test_dist_subgraph_loader.py
index 380baf26..e48eb80b 100644
--- a/test/python/test_dist_subgraph_loader.py
+++ b/test/python/test_dist_subgraph_loader.py
@@ -21,7 +21,7 @@
 
 # sampling options
 sampling_nprocs = 2
-device_num = 2
+device_num = 1
 
 
 def _prepare_dataset(rank: int):
   """
@@ -87,12 +87,12 @@ def _check_sample_result(data, rank):
     true_edge_id = torch.tensor([0, 1, 2, 3, 4, 5, 9, 12, 13, 14, 16], device='cuda:0')
     true_mapping = torch.tensor([0, 2, 5], device='cuda:0')
   else:
-    true_node = torch.tensor([0, 1, 3, 5, 6, 7], device='cuda:1')
+    true_node = torch.tensor([0, 1, 3, 5, 6, 7], device='cuda:0')
     true_edge_index = torch.tensor([[0, 3, 0, 5, 0, 1, 5, 1, 2, 4, 3],
                                     [3, 3, 4, 5, 0, 0, 0, 1, 1, 1, 2]],
-                                   device='cuda:1')
-    true_edge_id = torch.tensor([12, 13, 14, 16, 0, 1, 2, 3, 4, 5, 9], device='cuda:1')
-    true_mapping = torch.tensor([0, 2, 5], device='cuda:1')
+                                   device='cuda:0')
+    true_edge_id = torch.tensor([12, 13, 14, 16, 0, 1, 2, 3, 4, 5, 9], device='cuda:0')
+    true_mapping = torch.tensor([0, 2, 5], device='cuda:0')
   tc.assertTrue(glt.utils.tensor_equal_with_device(data.node, true_node))
   tc.assertTrue(glt.utils.tensor_equal_with_device(data.edge_index, true_edge_index))
   tc.assertTrue(glt.utils.tensor_equal_with_device(data.edge, true_edge_id))
@@ -141,7 +141,7 @@ def run_test_as_worker(world_size: int, rank: int,
   else:
     worker_options = glt.distributed.MpDistSamplingWorkerOptions(
       num_workers=sampling_nprocs,
-      worker_devices=[torch.device('cuda', i % device_num)
+      worker_devices=[torch.device('cuda', 0)
                       for i in range(sampling_nprocs)],
       worker_concurrency=2,
       master_addr='localhost',
@@ -159,7 +159,7 @@ def run_test_as_worker(world_size: int, rank: int,
     drop_last=False,
     with_edge=True,
     collect_features=True,
-    to_device=torch.device('cuda', rank % device_num),
+    to_device=torch.device('cuda', 0),
     worker_options=worker_options
   )
 
@@ -214,7 +214,7 @@ def run_test_as_client(num_servers: int, num_clients: int, client_rank: int,
   options = glt.distributed.RemoteDistSamplingWorkerOptions(
     server_rank=target_server_rank,
     num_workers=sampling_nprocs,
-    worker_devices=[torch.device('cuda', i % device_num)
+    worker_devices=[torch.device('cuda', 0)
                     for i in range(sampling_nprocs)],
     worker_concurrency=2,
     master_addr='localhost',
@@ -232,7 +232,7 @@ def run_test_as_client(num_servers: int, num_clients: int, client_rank: int,
     drop_last=False,
     with_edge=True,
     collect_features=True,
-    to_device=torch.device('cuda', client_rank % device_num),
+    to_device=torch.device('cuda', 0),
     worker_options=options
   )