Skip to content

Commit b376e8c

Browse files
authored
(torchx/specs) Allow roles to specify their own workspaces
Differential Revision: D83793199 Pull Request resolved: #1139
1 parent afa6bec commit b376e8c

File tree

7 files changed

+279
-178
lines changed

7 files changed

+279
-178
lines changed

torchx/runner/api.py

Lines changed: 36 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -426,26 +426,42 @@ def dryrun(
426426

427427
sched._pre_build_validate(app, scheduler, resolved_cfg)
428428

429-
if workspace and isinstance(sched, WorkspaceMixin):
430-
role = app.roles[0]
431-
old_img = role.image
432-
433-
logger.info(f"Checking for changes in workspace `{workspace}`...")
434-
logger.info(
435-
'To disable workspaces pass: --workspace="" from CLI or workspace=None programmatically.'
436-
)
437-
sched.build_workspace_and_update_role2(role, workspace, resolved_cfg)
438-
439-
if old_img != role.image:
440-
logger.info(
441-
f"Built new image `{role.image}` based on original image `{old_img}`"
442-
f" and changes in workspace `{workspace}` for role[0]={role.name}."
443-
)
444-
else:
445-
logger.info(
446-
f"Reusing original image `{old_img}` for role[0]={role.name}."
447-
" Either a patch was built or no changes to workspace was detected."
448-
)
429+
if isinstance(sched, WorkspaceMixin):
430+
for i, role in enumerate(app.roles):
431+
role_workspace = role.workspace
432+
433+
if i == 0 and workspace:
434+
# NOTE: torchx originally took workspace as a runner arg and only applied the workspace to role[0]
435+
# later, torchx added support for the workspace attr in Role
436+
# for BC, give precedence to the workspace argument over the workspace attr for role[0]
437+
if role_workspace:
438+
logger.info(
439+
f"Using workspace={workspace} over role[{i}].workspace={role_workspace} for role[{i}]={role.name}."
440+
" To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically." # noqa: B950
441+
)
442+
role_workspace = workspace
443+
444+
if role_workspace:
445+
old_img = role.image
446+
logger.info(
447+
f"Checking for changes in workspace `{role_workspace}` for role[{i}]={role.name}..."
448+
)
449+
# TODO kiuk@ once we deprecate the `workspace` argument in runner APIs we can simplify the signature of
450+
# build_workspace_and_update_role2() to just taking the role and resolved_cfg
451+
sched.build_workspace_and_update_role2(
452+
role, role_workspace, resolved_cfg
453+
)
454+
455+
if old_img != role.image:
456+
logger.info(
457+
f"Built new image `{role.image}` based on original image `{old_img}`"
458+
f" and changes in workspace `{role_workspace}` for role[{i}]={role.name}."
459+
)
460+
else:
461+
logger.info(
462+
f"Reusing original image `{old_img}` for role[{i}]={role.name}."
463+
" Either a patch was built or no changes to workspace was detected."
464+
)
449465

450466
sched._validate(app, scheduler, resolved_cfg)
451467
dryrun_info = sched.submit_dryrun(app, resolved_cfg)

torchx/runner/test/api_test.py

Lines changed: 74 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,18 @@
2020
create_scheduler,
2121
LocalDirectoryImageProvider,
2222
)
23-
from torchx.specs import AppDryRunInfo, CfgVal
24-
from torchx.specs.api import (
23+
from torchx.specs import (
2524
AppDef,
25+
AppDryRunInfo,
2626
AppHandle,
2727
AppState,
28+
CfgVal,
2829
parse_app_handle,
2930
Resource,
3031
Role,
3132
runopts,
3233
UnknownAppException,
34+
Workspace,
3335
)
3436
from torchx.specs.finder import ComponentNotFoundException
3537
from torchx.test.fixtures import TestWithTmpDir
@@ -400,6 +402,16 @@ def build_workspace_and_update_role(
400402
) -> None:
401403
if self.build_new_img:
402404
role.image = f"{role.image}_new"
405+
role.env["SRC_WORKSPACE"] = workspace
406+
407+
def create_role(image: str, workspace: str | None = None) -> Role:
408+
return Role(
409+
name="noop",
410+
image=image,
411+
resource=resource.SMALL,
412+
entrypoint="/bin/true",
413+
workspace=Workspace.from_str(workspace),
414+
)
403415

404416
with Runner(
405417
name=SESSION_NAME,
@@ -411,33 +423,71 @@ def build_workspace_and_update_role(
411423
"builds-img": lambda name, **kwargs: TestScheduler(build_new_img=True),
412424
},
413425
) as runner:
426+
app = AppDef(
427+
"ignored",
428+
roles=[create_role(image="foo"), create_role(image="bar")],
429+
)
430+
roles = runner.dryrun(
431+
app, "no-build-img", workspace="//workspace"
432+
).request.roles
433+
self.assertEqual("foo", roles[0].image)
434+
self.assertEqual("bar", roles[1].image)
435+
436+
roles = runner.dryrun(
437+
app, "builds-img", workspace="//workspace"
438+
).request.roles
439+
440+
# workspace is attached to role[0] when role[0].workspace is `None`
441+
self.assertEqual("foo_new", roles[0].image)
442+
self.assertEqual("bar", roles[1].image)
443+
444+
# now run with role[0] having workspace attribute defined
414445
app = AppDef(
415446
"ignored",
416447
roles=[
417-
Role(
418-
name="sleep",
419-
image="foo",
420-
resource=resource.SMALL,
421-
entrypoint="sleep",
422-
args=["1"],
423-
),
424-
Role(
425-
name="sleep",
426-
image="bar",
427-
resource=resource.SMALL,
428-
entrypoint="sleep",
429-
args=["1"],
430-
),
448+
create_role(image="foo", workspace="//should_be_overriden"),
449+
create_role(image="bar"),
450+
],
451+
)
452+
roles = runner.dryrun(
453+
app, "builds-img", workspace="//workspace"
454+
).request.roles
455+
# workspace argument should override role[0].workspace attribute
456+
self.assertEqual("foo_new", roles[0].image)
457+
self.assertEqual("//workspace", roles[0].env["SRC_WORKSPACE"])
458+
self.assertEqual("bar", roles[1].image)
459+
460+
# now run with both role[0] and role[1] having workspace attr
461+
app = AppDef(
462+
"ignored",
463+
roles=[
464+
create_role(image="foo", workspace="//foo"),
465+
create_role(image="bar", workspace="//bar"),
466+
],
467+
)
468+
roles = runner.dryrun(
469+
app, "builds-img", workspace="//workspace"
470+
).request.roles
471+
472+
# workspace argument should override role[0].workspace attribute
473+
self.assertEqual("foo_new", roles[0].image)
474+
self.assertEqual("//workspace", roles[0].env["SRC_WORKSPACE"])
475+
self.assertEqual("bar_new", roles[1].image)
476+
self.assertEqual("//bar", roles[1].env["SRC_WORKSPACE"])
477+
478+
# now run with both role[0] and role[1] having workspace attr but no workspace arg
479+
app = AppDef(
480+
"ignored",
481+
roles=[
482+
create_role(image="foo", workspace="//foo"),
483+
create_role(image="bar", workspace="//bar"),
431484
],
432485
)
433-
dryruninfo = runner.dryrun(app, "no-build-img", workspace="//workspace")
434-
self.assertEqual("foo", dryruninfo.request.roles[0].image)
435-
self.assertEqual("bar", dryruninfo.request.roles[1].image)
436-
437-
dryruninfo = runner.dryrun(app, "builds-img", workspace="//workspace")
438-
# workspace is attached to role[0] by default
439-
self.assertEqual("foo_new", dryruninfo.request.roles[0].image)
440-
self.assertEqual("bar", dryruninfo.request.roles[1].image)
486+
roles = runner.dryrun(app, "builds-img", workspace=None).request.roles
487+
self.assertEqual("foo_new", roles[0].image)
488+
self.assertEqual("//foo", roles[0].env["SRC_WORKSPACE"])
489+
self.assertEqual("bar_new", roles[1].image)
490+
self.assertEqual("//bar", roles[1].env["SRC_WORKSPACE"])
441491

442492
def test_describe(self, _) -> None:
443493
with self.get_runner() as runner:

torchx/specs/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
UnknownAppException,
4646
UnknownSchedulerException,
4747
VolumeMount,
48+
Workspace,
4849
)
4950
from torchx.specs.builders import make_app_handle, materialize_appdef, parse_mounts
5051

@@ -236,4 +237,6 @@ def gpu_x_1() -> Dict[str, Resource]:
236237
"torchx_run_args_from_json",
237238
"TorchXRunArgs",
238239
"ALL",
240+
"TORCHX_HOME",
241+
"Workspace",
239242
]

torchx/specs/api.py

Lines changed: 80 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,78 @@ class DeviceMount:
350350
permissions: str = "rwm"
351351

352352

353+
@dataclass
354+
class Workspace:
355+
"""
356+
Specifies a local "workspace" (a set of directories). Workspaces are ad-hoc built
357+
into an (usually ephemeral) image. This effectively mirrors the local code changes
358+
at job submission time.
359+
360+
For example:
361+
362+
1. ``projects={"~/github/torch": "torch"}`` copies ``~/github/torch/**`` into ``$REMOTE_WORKSPACE_ROOT/torch/**``
363+
2. ``projects={"~/github/torch": ""}`` copies ``~/github/torch/**`` into ``$REMOTE_WORKSPACE_ROOT/**``
364+
365+
The exact location of ``$REMOTE_WORKSPACE_ROOT`` is implementation dependent and varies between
366+
different implementations of :py:class:`~torchx.workspace.api.WorkspaceMixin`.
367+
Check the scheduler documentation for details on which workspace it supports.
368+
369+
Note: ``projects`` maps the location of the local project to a sub-directory in the remote workspace root directory.
370+
Typically the local project location is a directory path (e.g. ``/home/foo/github/torch``).
371+
372+
373+
Attributes:
374+
projects: mapping of local project to the sub-dir in the remote workspace dir.
375+
"""
376+
377+
projects: dict[str, str]
378+
379+
def __bool__(self) -> bool:
380+
"""False if no projects mapping. Lets us use workspace object in an if-statement"""
381+
return bool(self.projects)
382+
383+
def is_unmapped_single_project(self) -> bool:
384+
"""
385+
Returns ``True`` if this workspace only has 1 project
386+
and its target mapping is an empty string.
387+
"""
388+
return len(self.projects) == 1 and not next(iter(self.projects.values()))
389+
390+
@staticmethod
391+
def from_str(workspace: str | None) -> "Workspace":
392+
import yaml
393+
394+
if not workspace:
395+
return Workspace({})
396+
397+
projects = yaml.safe_load(workspace)
398+
if isinstance(projects, str): # single project workspace
399+
projects = {projects: ""}
400+
else: # multi-project workspace
401+
# Replace None mappings with "" (empty string)
402+
projects = {k: ("" if v is None else v) for k, v in projects.items()}
403+
404+
return Workspace(projects)
405+
406+
def __str__(self) -> str:
407+
"""
408+
Returns a string representation of the Workspace by concatenating
409+
the project mappings using ';' as a delimiter and ':' between key and value.
410+
If the single-project workspace with no target mapping, then simply
411+
returns the src (local project dir)
412+
413+
NOTE: meant to be used for logging purposes not serde.
414+
Therefore not symmetric with :py:func:`Workspace.from_str`.
415+
416+
"""
417+
if self.is_unmapped_single_project():
418+
return next(iter(self.projects))
419+
else:
420+
return ";".join(
421+
k if not v else f"{k}:{v}" for k, v in self.projects.items()
422+
)
423+
424+
353425
@dataclass
354426
class Role:
355427
"""
@@ -402,6 +474,10 @@ class Role:
402474
metadata: Free form information that is associated with the role, for example
403475
scheduler specific data. The key should follow the pattern: ``$scheduler.$key``
404476
mounts: a list of mounts on the machine
477+
workspace: local project directories to be mirrored on the remote job.
478+
NOTE: The workspace argument provided to the :py:class:`~torchx.runner.api.Runner` APIs
479+
only takes effect on ``appdef.role[0]`` and overrides this attribute.
480+
405481
"""
406482

407483
name: str
@@ -417,9 +493,10 @@ class Role:
417493
resource: Resource = field(default_factory=_null_resource)
418494
port_map: Dict[str, int] = field(default_factory=dict)
419495
metadata: Dict[str, Any] = field(default_factory=dict)
420-
mounts: List[Union[BindMount, VolumeMount, DeviceMount]] = field(
421-
default_factory=list
422-
)
496+
mounts: List[BindMount | VolumeMount | DeviceMount] = field(default_factory=list)
497+
workspace: Workspace | None = None
498+
499+
# DEPRECATED DO NOT SET, WILL BE REMOVED SOON
423500
overrides: Dict[str, Any] = field(default_factory=dict)
424501

425502
# pyre-ignore

0 commit comments

Comments
 (0)