diff --git a/.github/workflows/glt-ci.yml b/.github/workflows/glt-ci.yml
index 2eb3fe25..4e94fbee 100644
--- a/.github/workflows/glt-ci.yml
+++ b/.github/workflows/glt-ci.yml
@@ -23,8 +23,8 @@ env:
 
 jobs:
   run-glt-unittests:
-    runs-on: self-hosted
-    if: ${{ github.repository == 'alibaba/graphlearn-for-pytorch' }}
+    runs-on: glt-gpu-instances
+    if: ${{ github.repository == 'snapchat/graphlearn-for-pytorch' }}
     steps:
     - name: Checkout Code
       uses: actions/checkout@v3
diff --git a/.github/workflows/glt-v6d-ci.yml b/.github/workflows/glt-v6d-ci.yml
index bad618f4..735cacc9 100644
--- a/.github/workflows/glt-v6d-ci.yml
+++ b/.github/workflows/glt-v6d-ci.yml
@@ -23,8 +23,8 @@ env:
 
 jobs:
   run-glt-v6d-unittests:
-    runs-on: self-hosted
-    if: ${{ github.repository == 'alibaba/graphlearn-for-pytorch' }}
+    runs-on: glt-gpu-instances
+    if: ${{ github.repository == 'snapchat/graphlearn-for-pytorch' }}
     steps:
     - name: Checkout Code
       uses: actions/checkout@v3
diff --git a/.github/workflows/manylinux-cd.yml b/.github/workflows/manylinux-cd.yml
index 099a7bf8..ca53f61d 100644
--- a/.github/workflows/manylinux-cd.yml
+++ b/.github/workflows/manylinux-cd.yml
@@ -12,8 +12,8 @@ env:
 
 jobs:
   build:
-    runs-on: self-hosted
-    if: ${{ github.repository == 'alibaba/graphlearn-for-pytorch' }}
+    runs-on: ubuntu-latest
+    if: ${{ github.repository == 'snapchat/graphlearn-for-pytorch' }}
     steps:
     - name: Checkout Code
      uses: actions/checkout@v3
diff --git a/test/python/dist_test_utils.py b/test/python/dist_test_utils.py
index cfb883e6..70cc3a1e 100644
--- a/test/python/dist_test_utils.py
+++ b/test/python/dist_test_utils.py
@@ -35,7 +35,7 @@
 
 # fixed sampling options
 sampling_nprocs = 2
-device_num = 2
+device_num = 1
 
 
 def _prepare_dataset(rank: int,
@@ -110,8 +110,7 @@ def _prepare_dataset(rank: int,
   weighted_graph = glt.data.Graph(weighted_csr_topo, 'CPU')
 
   # feature
-  device_group_list = [glt.data.DeviceGroup(0, [0]),
-                       glt.data.DeviceGroup(1, [1])]
+  device_group_list = [glt.data.DeviceGroup(0, [0])]
   split_ratio = 0.2
   nfeat = torch.tensor(nodes, dtype=torch.float32).unsqueeze(1).repeat(1, 512)
@@ -229,8 +228,7 @@ def _prepare_hetero_dataset(
   }
 
   # feature
-  device_group_list = [glt.data.DeviceGroup(0, [0]),
-                       glt.data.DeviceGroup(1, [1])]
+  device_group_list = [glt.data.DeviceGroup(0, [0])]
   split_ratio = 0.2
   user_nfeat = rank + torch.zeros(len(user_nodes), 512, dtype=torch.float32)
diff --git a/test/python/test_dist_feature.py b/test/python/test_dist_feature.py
index 32306dc6..d44bc2dd 100644
--- a/test/python/test_dist_feature.py
+++ b/test/python/test_dist_feature.py
@@ -27,7 +27,7 @@ def run_dist_feature_test(world_size: int, rank: int, feature: glt.data.Feature,
   partition2workers = glt.distributed.rpc_sync_data_partitions(world_size, rank)
   rpc_router = glt.distributed.RpcDataPartitionRouter(partition2workers)
 
-  current_device = torch.device('cuda', rank % 2)
+  current_device = torch.device('cuda', 0)
 
   dist_feature = glt.distributed.DistFeature(
     world_size, rank, feature, partition_book,
@@ -74,9 +74,15 @@ def test_dist_feature_lookup(self):
     ])
     partition_book.share_memory_()
 
+    # device_group_list = [
+    #   glt.data.DeviceGroup(0, [0]),
+    #   glt.data.DeviceGroup(1, [1])
+    # ]
+    # TODO(kmonte): Swap back to using real device_group_list when we have
+    # a way to run tests on multiple GPUs.
+    device_group_list = None
     device_group_list = [
       glt.data.DeviceGroup(0, [0]),
-      glt.data.DeviceGroup(1, [1])
     ]
 
     split_ratio = 0.8
diff --git a/test/python/test_dist_link_loader.py b/test/python/test_dist_link_loader.py
index d0a8fb92..3a751a3f 100644
--- a/test/python/test_dist_link_loader.py
+++ b/test/python/test_dist_link_loader.py
@@ -23,6 +23,9 @@
 from dist_test_utils import _prepare_dataset, _prepare_hetero_dataset
 from parameterized import parameterized
+
+device_num = 1
+
 
 
 def _check_sample_result(data, edge_dir='out'):
   tc = unittest.TestCase()
@@ -221,7 +224,7 @@ def run_test_as_worker(world_size: int, rank: int,
   else:
     worker_options = glt.distributed.MpDistSamplingWorkerOptions(
       num_workers=sampling_nprocs,
-      worker_devices=[torch.device('cuda', i % device_num)
+      worker_devices=[torch.device('cuda', 0)
                       for i in range(sampling_nprocs)],
      worker_concurrency=2,
       master_addr='localhost',
@@ -241,7 +244,7 @@ def run_test_as_worker(world_size: int, rank: int,
     with_edge=True,
     edge_dir=edge_dir,
     collect_features=True,
-    to_device=torch.device('cuda', rank % device_num),
+    to_device=torch.device('cuda', 0),
     worker_options=worker_options
   )
 
@@ -297,7 +300,7 @@ def run_test_as_client(num_servers: int, num_clients: int, client_rank: int,
   options = glt.distributed.RemoteDistSamplingWorkerOptions(
     server_rank=target_server_rank,
     num_workers=sampling_nprocs,
-    worker_devices=[torch.device('cuda', i % device_num)
+    worker_devices=[torch.device('cuda', 0)
                     for i in range(sampling_nprocs)],
     worker_concurrency=2,
     master_addr='localhost',
@@ -317,7 +320,7 @@ def run_test_as_client(num_servers: int, num_clients: int, client_rank: int,
     with_edge=True,
     edge_dir='out',
     collect_features=True,
-    to_device=torch.device('cuda', client_rank % device_num),
+    to_device=torch.device('cuda', 0),
     worker_options=options
   )
 
diff --git a/test/python/test_dist_neighbor_loader.py b/test/python/test_dist_neighbor_loader.py
index 480e693e..4219fed3 100644
--- a/test/python/test_dist_neighbor_loader.py
+++ b/test/python/test_dist_neighbor_loader.py
@@ -176,7 +176,7 @@ def run_test_as_worker(world_size: int, rank: int,
   else:
     worker_options = glt.distributed.MpDistSamplingWorkerOptions(
       num_workers=sampling_nprocs,
-      worker_devices=[torch.device('cuda', i % device_num)
+      worker_devices=[torch.device('cuda', 0)
                       for i in range(sampling_nprocs)],
       worker_concurrency=2,
       master_addr='localhost',
@@ -195,7 +195,7 @@ def run_test_as_worker(world_size: int, rank: int,
     with_edge=True,
     edge_dir=edge_dir,
     collect_features=True,
-    to_device=torch.device('cuda', rank % device_num),
+    to_device=torch.device('cuda', 0),
     worker_options=worker_options
   )
 
@@ -254,7 +254,7 @@ def run_test_as_client(num_servers: int, num_clients: int, client_rank: int, ser
     # Automatically assign server_rank (server_rank_list) if server_rank (server_rank_list) is None
     server_rank=server_rank,
     num_workers=sampling_nprocs,
-    worker_devices=[torch.device('cuda', i % device_num)
+    worker_devices=[torch.device('cuda', 0)
                     for i in range(sampling_nprocs)],
     worker_concurrency=2,
     master_addr='localhost',
@@ -274,7 +274,7 @@ def run_test_as_client(num_servers: int, num_clients: int, client_rank: int, ser
     with_edge=True,
     edge_dir=edge_dir,
     collect_features=True,
-    to_device=torch.device('cuda', client_rank % device_num),
+    to_device=torch.device('cuda', 0),
     worker_options=options
   )
 
diff --git a/test/python/test_dist_subgraph_loader.py b/test/python/test_dist_subgraph_loader.py
index 380baf26..e48eb80b 100644
--- a/test/python/test_dist_subgraph_loader.py
+++ b/test/python/test_dist_subgraph_loader.py
@@ -21,7 +21,7 @@
 
 # sampling options
 sampling_nprocs = 2
-device_num = 2
+device_num = 1
 
 
 def _prepare_dataset(rank: int):
   """
@@ -87,12 +87,12 @@ def _check_sample_result(data, rank):
     true_edge_id = torch.tensor([0, 1, 2, 3, 4, 5, 9, 12, 13, 14, 16], device='cuda:0')
     true_mapping = torch.tensor([0, 2, 5], device='cuda:0')
   else:
-    true_node = torch.tensor([0, 1, 3, 5, 6, 7], device='cuda:1')
+    true_node = torch.tensor([0, 1, 3, 5, 6, 7], device='cuda:0')
     true_edge_index = torch.tensor([[0, 3, 0, 5, 0, 1, 5, 1, 2, 4, 3],
                                     [3, 3, 4, 5, 0, 0, 0, 1, 1, 1, 2]],
-                                   device='cuda:1')
-    true_edge_id = torch.tensor([12, 13, 14, 16, 0, 1, 2, 3, 4, 5, 9], device='cuda:1')
-    true_mapping = torch.tensor([0, 2, 5], device='cuda:1')
+                                   device='cuda:0')
+    true_edge_id = torch.tensor([12, 13, 14, 16, 0, 1, 2, 3, 4, 5, 9], device='cuda:0')
+    true_mapping = torch.tensor([0, 2, 5], device='cuda:0')
   tc.assertTrue(glt.utils.tensor_equal_with_device(data.node, true_node))
   tc.assertTrue(glt.utils.tensor_equal_with_device(data.edge_index, true_edge_index))
   tc.assertTrue(glt.utils.tensor_equal_with_device(data.edge, true_edge_id))
@@ -141,7 +141,7 @@ def run_test_as_worker(world_size: int, rank: int,
   else:
     worker_options = glt.distributed.MpDistSamplingWorkerOptions(
       num_workers=sampling_nprocs,
-      worker_devices=[torch.device('cuda', i % device_num)
+      worker_devices=[torch.device('cuda', 0)
                       for i in range(sampling_nprocs)],
       worker_concurrency=2,
       master_addr='localhost',
@@ -159,7 +159,7 @@ def run_test_as_worker(world_size: int, rank: int,
     drop_last=False,
     with_edge=True,
     collect_features=True,
-    to_device=torch.device('cuda', rank % device_num),
+    to_device=torch.device('cuda', 0),
     worker_options=worker_options
   )
 
@@ -214,7 +214,7 @@ def run_test_as_client(num_servers: int, num_clients: int, client_rank: int,
   options = glt.distributed.RemoteDistSamplingWorkerOptions(
     server_rank=target_server_rank,
     num_workers=sampling_nprocs,
-    worker_devices=[torch.device('cuda', i % device_num)
+    worker_devices=[torch.device('cuda', 0)
                     for i in range(sampling_nprocs)],
     worker_concurrency=2,
     master_addr='localhost',
@@ -232,7 +232,7 @@ def run_test_as_client(num_servers: int, num_clients: int, client_rank: int,
     drop_last=False,
     with_edge=True,
     collect_features=True,
-    to_device=torch.device('cuda', client_rank % device_num),
+    to_device=torch.device('cuda', 0),
     worker_options=options
   )